diff options
23 files changed, 1294 insertions, 1239 deletions
diff --git a/qmake/CMakeLists.txt b/qmake/CMakeLists.txt index d724b44d59..fa7d50f234 100644 --- a/qmake/CMakeLists.txt +++ b/qmake/CMakeLists.txt @@ -40,7 +40,6 @@ qt_add_tool(qmake # special case ../src/3rdparty/pcre2/src/pcre2_ucp.h ../src/3rdparty/pcre2/src/pcre2_valid_utf.c ../src/3rdparty/pcre2/src/pcre2_xclass.c - ../src/corelib/codecs/qutfcodec.cpp ../src/corelib/codecs/qutfcodec_p.h ../src/corelib/global/qendian.cpp # special case ../src/corelib/global/qglobal.cpp ../src/corelib/global/qglobal.h ../src/corelib/global/qlibraryinfo.cpp @@ -105,6 +104,7 @@ qt_add_tool(qmake # special case ../src/corelib/tools/qringbuffer.cpp # special case ../src/corelib/text/qstring.cpp ../src/corelib/text/qstring.h ../src/corelib/text/qstringbuilder.cpp ../src/corelib/text/qstringbuilder.h + ../src/corelib/text/qstringconverter.cpp ../src/corelib/text/qstringconverter.h ../src/corelib/text/qstringconverter_p.h ../src/corelib/text/qstringlist.cpp ../src/corelib/text/qstringlist.h ../src/corelib/text/qstringmatcher.h ../src/corelib/tools/qvector.h diff --git a/qmake/Makefile.unix b/qmake/Makefile.unix index 98d255f2d5..c0b6704351 100644 --- a/qmake/Makefile.unix +++ b/qmake/Makefile.unix @@ -17,7 +17,6 @@ OBJS = \ #qt code (please keep in order matching DEPEND_SRC) QOBJS = \ - qutfcodec.o \ qendian.o qglobal.o qlogging.o qmalloc.o qnumeric.o qoperatingsystemversion.o qrandom.o \ qabstractfileengine.o qbuffer.o qdatastream.o qdebug.o \ qdir.o qdiriterator.o \ @@ -32,7 +31,7 @@ QOBJS = \ qcalendar.o qgregoriancalendar.o qromancalendar.o \ qcryptographichash.o qdatetime.o qhash.o \ qlocale.o qlocale_tools.o qmap.o qregularexpression.o qregexp.o qringbuffer.o \ - qstringbuilder.o qstring.o qstringlist.o qversionnumber.o \ + qstringbuilder.o qstring.o qstringconverter.o qstringlist.o qversionnumber.o \ qvsnprintf.o qxmlstream.o qxmlutils.o \ pcre2_auto_possess.o pcre2_chartables.o pcre2_compile.o pcre2_config.o \ pcre2_context.o pcre2_dfa_match.o pcre2_error.o pcre2_extuni.o \ @@ 
-74,7 +73,6 @@ DEPEND_SRC = \ $(QMKGENSRC)/win32/msvc_vcxproj.cpp \ $(QMKGENSRC)/win32/winmakefile.cpp \ $(QMKGENSRC)/xmloutput.cpp \ - $(SOURCE_PATH)/src/corelib/codecs/qutfcodec.cpp \ $(SOURCE_PATH)/src/corelib/global/qendian.cpp \ $(SOURCE_PATH)/src/corelib/global/qglobal.cpp \ $(SOURCE_PATH)/src/corelib/global/qlibraryinfo.cpp \ @@ -122,6 +120,7 @@ DEPEND_SRC = \ $(SOURCE_PATH)/src/corelib/text/qregularexpression.cpp \ $(SOURCE_PATH)/src/corelib/text/qregexp.cpp \ $(SOURCE_PATH)/src/corelib/text/qstringbuilder.cpp \ + $(SOURCE_PATH)/src/corelib/text/qstringconverter.cpp \ $(SOURCE_PATH)/src/corelib/text/qstring.cpp \ $(SOURCE_PATH)/src/corelib/text/qstringlist.cpp \ $(SOURCE_PATH)/src/corelib/text/qvsnprintf.cpp \ @@ -380,15 +379,15 @@ qoperatingsystemversion_darwin.o: $(SOURCE_PATH)/src/corelib/global/qoperatingsy qcore_foundation.o: $(SOURCE_PATH)/src/corelib/kernel/qcore_foundation.mm $(CXX) -c -o $@ $(CXXFLAGS) $< -qutfcodec.o: $(SOURCE_PATH)/src/corelib/codecs/qutfcodec.cpp - $(CXX) -c -o $@ $(CXXFLAGS) $< - qstring.o: $(SOURCE_PATH)/src/corelib/text/qstring.cpp $(CXX) -c -o $@ $(CXXFLAGS) $< qstringbuilder.o: $(SOURCE_PATH)/src/corelib/text/qstringbuilder.cpp $(CXX) -c -o $@ $(CXXFLAGS) $< +qstringconverter.o: $(SOURCE_PATH)/src/corelib/text/qstringconverter.cpp + $(CXX) -c -o $@ $(CXXFLAGS) $< + qlocale.o: $(SOURCE_PATH)/src/corelib/text/qlocale.cpp $(CXX) -c -o $@ $(CXXFLAGS) $< diff --git a/qmake/Makefile.win32 b/qmake/Makefile.win32 index d3a85c17b2..df47dacd15 100644 --- a/qmake/Makefile.win32 +++ b/qmake/Makefile.win32 @@ -104,8 +104,8 @@ QTOBJS= \ qoperatingsystemversion_win.obj \ qregexp.obj \ qromancalendar.obj \ - qutfcodec.obj \ qstring.obj \ + qstringconverter.obj \ qstringlist.obj \ qstringbuilder.obj \ qsystemerror.obj \ diff --git a/qmake/qmake.pro b/qmake/qmake.pro index 243f07ac2c..fcd1c17dcf 100644 --- a/qmake/qmake.pro +++ b/qmake/qmake.pro @@ -159,11 +159,11 @@ SOURCES += \ qsettings.cpp \ qstring.cpp \ qstringbuilder.cpp \ + 
qstringconverter.cpp \ qstringlist.cpp \ qsystemerror.cpp \ qtemporaryfile.cpp \ qtextstream.cpp \ - qutfcodec.cpp \ quuid.cpp \ qvariant.cpp \ qversionnumber.cpp \ @@ -217,12 +217,13 @@ HEADERS += \ qromancalendar_p.h \ qstring.h \ qstringbuilder.h \ + qstringconverter_p.h \ + qstringconverter.h \ qstringlist.h \ qstringmatcher.h \ qsystemerror_p.h \ qtemporaryfile.h \ qtextstream.h \ - qutfcodec_p.h \ quuid.h \ qvector.h \ qversionnumber.h \ diff --git a/src/corelib/CMakeLists.txt b/src/corelib/CMakeLists.txt index ff28b2d20c..710d025caf 100644 --- a/src/corelib/CMakeLists.txt +++ b/src/corelib/CMakeLists.txt @@ -169,6 +169,7 @@ qt_add_module(Core text/qstring.cpp text/qstring.h text/qstring_compat.cpp text/qstringalgorithms.h text/qstringalgorithms_p.h + text/qstringconverter.cpp text/qstringconverter.h text/qstringconverter_p.h text/qstringbuilder.cpp text/qstringbuilder.h text/qstringiterator_p.h text/qstringlist.cpp text/qstringlist.h diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index a31bfbd218..c518ab1d9c 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -48,946 +48,6 @@ QT_BEGIN_NAMESPACE -enum { Endian = 0, Data = 1 }; - -static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf }; - -#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \ - || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64)) -static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept -{ - uint result = qCountLeadingZeroBits(v); - // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31 - // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when - // counting up: msb index is 0 (because it starts there), and the lsb index is 31. 
- result ^= sizeof(unsigned) * 8 - 1; - return result; -} -#endif - -#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) -static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) -{ - // do sixteen characters at a time - for ( ; end - src >= 16; src += 16, dst += 16) { -# ifdef __AVX2__ - __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); - __m128i data1 = _mm256_castsi256_si128(data); - __m128i data2 = _mm256_extracti128_si256(data, 1); -# else - __m128i data1 = _mm_loadu_si128((const __m128i*)src); - __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src); -# endif - - // check if everything is ASCII - // the highest ASCII value is U+007F - // Do the packing directly: - // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit - // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff, - // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII, - // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as - // "non-ASCII", but it's an acceptable compromise. 
- __m128i packed = _mm_packus_epi16(data1, data2); - __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); - - // store, even if there are non-ASCII characters here - _mm_storeu_si128((__m128i*)dst, packed); - - // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL) - ushort n = ~_mm_movemask_epi8(nonAscii); - if (n) { - // find the next probable ASCII character - // we don't want to load 32 bytes again in this loop if we know there are non-ASCII - // characters still coming - nextAscii = src + qBitScanReverse(n) + 1; - - n = qCountTrailingZeroBits(n); - dst += n; - src += n; - return false; - } - } - - if (end - src >= 8) { - // do eight characters at a time - __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)); - __m128i packed = _mm_packus_epi16(data, data); - __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); - - // store even non-ASCII - _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed); - - uchar n = ~_mm_movemask_epi8(nonAscii); - if (n) { - nextAscii = src + qBitScanReverse(n) + 1; - n = qCountTrailingZeroBits(n); - dst += n; - src += n; - return false; - } - } - - return src == end; -} - -static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) -{ - // do sixteen characters at a time - for ( ; end - src >= 16; src += 16, dst += 16) { - __m128i data = _mm_loadu_si128((const __m128i*)src); - -#ifdef __AVX2__ - const int BitSpacing = 2; - // load and zero extend to an YMM register - const __m256i extended = _mm256_cvtepu8_epi16(data); - - uint n = _mm256_movemask_epi8(extended); - if (!n) { - // store - _mm256_storeu_si256((__m256i*)dst, extended); - continue; - } -#else - const int BitSpacing = 1; - - // check if everything is ASCII - // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII - uint n = _mm_movemask_epi8(data); - if (!n) { - // unpack - _mm_storeu_si128((__m128i*)dst, 
_mm_unpacklo_epi8(data, _mm_setzero_si128())); - _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); - continue; - } -#endif - - // copy the front part that is still ASCII - while (!(n & 1)) { - *dst++ = *src++; - n >>= BitSpacing; - } - - // find the next probable ASCII character - // we don't want to load 16 bytes again in this loop if we know there are non-ASCII - // characters still coming - n = qBitScanReverse(n); - nextAscii = src + (n / BitSpacing) + 1; - return false; - - } - - if (end - src >= 8) { - __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src)); - uint n = _mm_movemask_epi8(data) & 0xff; - if (!n) { - // unpack and store - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128())); - } else { - while (!(n & 1)) { - *dst++ = *src++; - n >>= 1; - } - - n = qBitScanReverse(n); - nextAscii = src + n + 1; - return false; - } - } - - return src == end; -} - -static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) -{ -#ifdef __AVX2__ - // do 32 characters at a time - // (this is similar to simdTestMask in qstring.cpp) - const __m256i mask = _mm256_set1_epi8(0x80); - for ( ; end - src >= 32; src += 32) { - __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); - if (_mm256_testz_si256(mask, data)) - continue; - - uint n = _mm256_movemask_epi8(data); - Q_ASSUME(n); - - // find the next probable ASCII character - // we don't want to load 32 bytes again in this loop if we know there are non-ASCII - // characters still coming - nextAscii = src + qBitScanReverse(n) + 1; - - // return the non-ASCII character - return src + qCountTrailingZeroBits(n); - } -#endif - - // do sixteen characters at a time - for ( ; end - src >= 16; src += 16) { - __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); - - // check if everything is ASCII - // movemask extracts the high bit of every byte, so n is 
non-zero if something isn't ASCII - uint n = _mm_movemask_epi8(data); - if (!n) - continue; - - // find the next probable ASCII character - // we don't want to load 16 bytes again in this loop if we know there are non-ASCII - // characters still coming - nextAscii = src + qBitScanReverse(n) + 1; - - // return the non-ASCII character - return src + qCountTrailingZeroBits(n); - } - - // do four characters at a time - for ( ; end - src >= 4; src += 4) { - quint32 data = qFromUnaligned<quint32>(src); - data &= 0x80808080U; - if (!data) - continue; - - // We don't try to guess which of the three bytes is ASCII and which - // one isn't. The chance that at least two of them are non-ASCII is - // better than 75%. - nextAscii = src; - return src; - } - nextAscii = end; - return src; -} -#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64 -static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) -{ - uint16x8_t maxAscii = vdupq_n_u16(0x7f); - uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; - uint16x8_t mask2 = vshlq_n_u16(mask1, 1); - - // do sixteen characters at a time - for ( ; end - src >= 16; src += 16, dst += 16) { - // load 2 lanes (or: "load interleaved") - uint16x8x2_t in = vld2q_u16(src); - - // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc), - // add those together into a scalar, and merge the scalars. 
- uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1)) - | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2)); - - // merge the two lanes by shifting the values of the second by 8 and inserting them - uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8); - - // store, even if there are non-ASCII characters here - vst1q_u8(dst, vreinterpretq_u8_u16(out)); - - if (nonAscii) { - // find the next probable ASCII character - // we don't want to load 32 bytes again in this loop if we know there are non-ASCII - // characters still coming - nextAscii = src + qBitScanReverse(nonAscii) + 1; - - nonAscii = qCountTrailingZeroBits(nonAscii); - dst += nonAscii; - src += nonAscii; - return false; - } - } - return src == end; -} - -static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) -{ - // do eight characters at a time - uint8x8_t msb_mask = vdup_n_u8(0x80); - uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; - for ( ; end - src >= 8; src += 8, dst += 8) { - uint8x8_t c = vld1_u8(src); - uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); - if (!n) { - // store - vst1q_u16(dst, vmovl_u8(c)); - continue; - } - - // copy the front part that is still ASCII - while (!(n & 1)) { - *dst++ = *src++; - n >>= 1; - } - - // find the next probable ASCII character - // we don't want to load 16 bytes again in this loop if we know there are non-ASCII - // characters still coming - n = qBitScanReverse(n); - nextAscii = src + n + 1; - return false; - - } - return src == end; -} - -static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) -{ - // The SIMD code below is untested, so just force an early return until - // we've had the time to verify it works. 
- nextAscii = end; - return src; - - // do eight characters at a time - uint8x8_t msb_mask = vdup_n_u8(0x80); - uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; - for ( ; end - src >= 8; src += 8) { - uint8x8_t c = vld1_u8(src); - uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); - if (!n) - continue; - - // find the next probable ASCII character - // we don't want to load 16 bytes again in this loop if we know there are non-ASCII - // characters still coming - nextAscii = src + qBitScanReverse(n) + 1; - - // return the non-ASCII character - return src + qCountTrailingZeroBits(n); - } - nextAscii = end; - return src; -} -#else -static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *) -{ - return false; -} - -static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *) -{ - return false; -} - -static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) -{ - nextAscii = end; - return src; -} -#endif - -QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len) -{ - // create a QByteArray with the worst case scenario size - QByteArray result(len * 3, Qt::Uninitialized); - uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData())); - const ushort *src = reinterpret_cast<const ushort *>(uc); - const ushort *const end = src + len; - - while (src != end) { - const ushort *nextAscii = end; - if (simdEncodeAscii(dst, nextAscii, src, end)) - break; - - do { - ushort uc = *src++; - int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end); - if (res < 0) { - // encoding error - append '?' 
- *dst++ = '?'; - } - } while (src < nextAscii); - } - - result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); - return result; -} - -QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state) -{ - uchar replacement = '?'; - int rlen = 3*len; - int surrogate_high = -1; - if (state) { - if (state->flags & QTextCodec::ConvertInvalidToNull) - replacement = 0; - if (!(state->flags & QTextCodec::IgnoreHeader)) - rlen += 3; - if (state->remainingChars) - surrogate_high = state->state_data[0]; - } - - - QByteArray rstr(rlen, Qt::Uninitialized); - uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData())); - const ushort *src = reinterpret_cast<const ushort *>(uc); - const ushort *const end = src + len; - - int invalid = 0; - if (state && !(state->flags & QTextCodec::IgnoreHeader)) { - // append UTF-8 BOM - *cursor++ = utf8bom[0]; - *cursor++ = utf8bom[1]; - *cursor++ = utf8bom[2]; - } - - const ushort *nextAscii = src; - while (src != end) { - int res; - ushort uc; - if (surrogate_high != -1) { - uc = surrogate_high; - surrogate_high = -1; - res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); - } else { - if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end)) - break; - - uc = *src++; - res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); - } - if (Q_LIKELY(res >= 0)) - continue; - - if (res == QUtf8BaseTraits::Error) { - // encoding error - ++invalid; - *cursor++ = replacement; - } else if (res == QUtf8BaseTraits::EndOfString) { - surrogate_high = uc; - break; - } - } - - rstr.resize(cursor - (const uchar*)rstr.constData()); - if (state) { - state->invalidChars += invalid; - state->flags |= QTextCodec::IgnoreHeader; - state->remainingChars = 0; - if (surrogate_high >= 0) { - state->remainingChars = 1; - state->state_data[0] = surrogate_high; - } - } - return rstr; -} - -QString QUtf8::convertToUnicode(const char *chars, int len) 
-{ - // UTF-8 to UTF-16 always needs the exact same number of words or less: - // UTF-8 UTF-16 - // 1 byte 1 word - // 2 bytes 1 word - // 3 bytes 1 word - // 4 bytes 2 words (one surrogate pair) - // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8), - // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or - // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK). - // - // The table holds for invalid sequences too: we'll insert one replacement char - // per invalid byte. - QString result(len, Qt::Uninitialized); - QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared - const QChar *end = convertToUnicode(data, chars, len); - result.truncate(end - data); - return result; -} - -/*! - \since 5.7 - \overload - - Converts the UTF-8 sequence of \a len octets beginning at \a chars to - a sequence of QChar starting at \a buffer. The buffer is expected to be - large enough to hold the result. An upper bound for the size of the - buffer is \a len QChars. - - If, during decoding, an error occurs, a QChar::ReplacementCharacter is - written. - - Returns a pointer to one past the last QChar written. - - This function never throws. 
-*/ - -QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, int len) noexcept -{ - ushort *dst = reinterpret_cast<ushort *>(buffer); - const uchar *src = reinterpret_cast<const uchar *>(chars); - const uchar *end = src + len; - - // attempt to do a full decoding in SIMD - const uchar *nextAscii = end; - if (!simdDecodeAscii(dst, nextAscii, src, end)) { - // at least one non-ASCII entry - // check if we failed to decode the UTF-8 BOM; if so, skip it - if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars)) - && end - src >= 3 - && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) { - src += 3; - } - - while (src < end) { - nextAscii = end; - if (simdDecodeAscii(dst, nextAscii, src, end)) - break; - - do { - uchar b = *src++; - int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); - if (res < 0) { - // decoding error - *dst++ = QChar::ReplacementCharacter; - } - } while (src < nextAscii); - } - } - - return reinterpret_cast<QChar *>(dst); -} - -QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state) -{ - bool headerdone = false; - ushort replacement = QChar::ReplacementCharacter; - int invalid = 0; - int res; - uchar ch = 0; - - // See above for buffer requirements for stateless decoding. However, that - // fails if the state is not empty. 
The following situations can add to the - // requirements: - // state contains chars starts with requirement - // 1 of 2 bytes valid continuation 0 - // 2 of 3 bytes same 0 - // 3 bytes of 4 same +1 (need to insert surrogate pair) - // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart) - // 2 of 3 bytes same +1 (same) - // 3 of 4 bytes same +1 (same) - QString result(len + 1, Qt::Uninitialized); - - ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); - const uchar *src = reinterpret_cast<const uchar *>(chars); - const uchar *end = src + len; - - if (state) { - if (state->flags & QTextCodec::IgnoreHeader) - headerdone = true; - if (state->flags & QTextCodec::ConvertInvalidToNull) - replacement = QChar::Null; - if (state->remainingChars) { - // handle incoming state first - uchar remainingCharsData[4]; // longest UTF-8 sequence possible - int remainingCharsCount = state->remainingChars; - int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); - - memset(remainingCharsData, 0, sizeof(remainingCharsData)); - memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); - memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); - - const uchar *begin = &remainingCharsData[1]; - res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, - static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); - if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) { - // special case for len == 0: - // if we were supplied an empty string, terminate the previous, unfinished sequence with error - ++invalid; - *dst++ = replacement; - } else if (res == QUtf8BaseTraits::EndOfString) { - // if we got EndOfString again, then there were too few bytes in src; - // copy to our state and return - state->remainingChars = remainingCharsCount + newCharsToCopy; - memcpy(&state->state_data[0], 
remainingCharsData, state->remainingChars); - return QString(); - } else if (!headerdone && res >= 0) { - // eat the UTF-8 BOM - headerdone = true; - if (dst[-1] == 0xfeff) - --dst; - } - - // adjust src now that we have maybe consumed a few chars - if (res >= 0) { - Q_ASSERT(res > remainingCharsCount); - src += res - remainingCharsCount; - } - } - } - - // main body, stateless decoding - res = 0; - const uchar *nextAscii = src; - const uchar *start = src; - while (res >= 0 && src < end) { - if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) - break; - - ch = *src++; - res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end); - if (!headerdone && res >= 0) { - headerdone = true; - if (src == start + 3) { // 3 == sizeof(utf8-bom) - // eat the UTF-8 BOM (it can only appear at the beginning of the string). - if (dst[-1] == 0xfeff) - --dst; - } - } - if (res == QUtf8BaseTraits::Error) { - res = 0; - ++invalid; - *dst++ = replacement; - } - } - - if (!state && res == QUtf8BaseTraits::EndOfString) { - // unterminated UTF sequence - *dst++ = QChar::ReplacementCharacter; - while (src++ < end) - *dst++ = QChar::ReplacementCharacter; - } - - result.truncate(dst - (const ushort *)result.unicode()); - if (state) { - state->invalidChars += invalid; - if (headerdone) - state->flags |= QTextCodec::IgnoreHeader; - if (res == QUtf8BaseTraits::EndOfString) { - --src; // unread the byte in ch - state->remainingChars = end - src; - memcpy(&state->state_data[0], src, end - src); - } else { - state->remainingChars = 0; - } - } - return result; -} - -struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii -{ - struct NoOutput {}; - static void appendUtf16(const NoOutput &, ushort) {} - static void appendUcs4(const NoOutput &, uint) {} -}; - -QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len) -{ - const uchar *src = reinterpret_cast<const uchar *>(chars); - const uchar *end = src + len; - const uchar *nextAscii = src; - bool 
isValidAscii = true; - - while (src < end) { - if (src >= nextAscii) - src = simdFindNonAscii(src, end, nextAscii); - if (src == end) - break; - - do { - uchar b = *src++; - if ((b & 0x80) == 0) - continue; - - isValidAscii = false; - QUtf8NoOutputTraits::NoOutput output; - int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end); - if (res < 0) { - // decoding error - return { false, false }; - } - } while (src < nextAscii); - } - - return { true, isValidAscii }; -} - -int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len) -{ - uint uc1, uc2; - auto src1 = reinterpret_cast<const uchar *>(utf8); - auto end1 = src1 + u8len; - QStringIterator src2(utf16, utf16 + u16len); - - while (src1 < end1 && src2.hasNext()) { - uchar b = *src1++; - uint *output = &uc1; - int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); - if (res < 0) { - // decoding error - uc1 = QChar::ReplacementCharacter; - } - - uc2 = src2.next(); - if (uc1 != uc2) - return int(uc1) - int(uc2); - } - - // the shorter string sorts first - return (end1 > src1) - int(src2.hasNext()); -} - -int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s) -{ - uint uc1; - auto src1 = reinterpret_cast<const uchar *>(utf8); - auto end1 = src1 + u8len; - auto src2 = reinterpret_cast<const uchar *>(s.latin1()); - auto end2 = src2 + s.size(); - - while (src1 < end1 && src2 < end2) { - uchar b = *src1++; - uint *output = &uc1; - int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); - if (res < 0) { - // decoding error - uc1 = QChar::ReplacementCharacter; - } - - uint uc2 = *src2++; - if (uc1 != uc2) - return int(uc1) - int(uc2); - } - - // the shorter string sorts first - return (end1 > src1) - (end2 > src2); -} - -QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) -{ - DataEndianness endian = e; - int length = 2*len; - if (!state || 
(!(state->flags & QTextCodec::IgnoreHeader))) { - length += 2; - } - if (e == DetectEndianness) { - endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; - } - - QByteArray d; - d.resize(length); - char *data = d.data(); - if (!state || !(state->flags & QTextCodec::IgnoreHeader)) { - QChar bom(QChar::ByteOrderMark); - if (endian == BigEndianness) - qToBigEndian(bom.unicode(), data); - else - qToLittleEndian(bom.unicode(), data); - data += 2; - } - if (endian == BigEndianness) - qToBigEndian<ushort>(uc, len, data); - else - qToLittleEndian<ushort>(uc, len, data); - - if (state) { - state->remainingChars = 0; - state->flags |= QTextCodec::IgnoreHeader; - } - return d; -} - -QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e) -{ - DataEndianness endian = e; - bool half = false; - uchar buf = 0; - bool headerdone = false; - if (state) { - headerdone = state->flags & QTextCodec::IgnoreHeader; - if (endian == DetectEndianness) - endian = (DataEndianness)state->state_data[Endian]; - if (state->remainingChars) { - half = true; - buf = state->state_data[Data]; - } - } - if (headerdone && endian == DetectEndianness) - endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? 
BigEndianness : LittleEndianness; - - QString result(len, Qt::Uninitialized); // worst case - QChar *qch = (QChar *)result.data(); - while (len--) { - if (half) { - QChar ch; - if (endian == LittleEndianness) { - ch.setRow(*chars++); - ch.setCell(buf); - } else { - ch.setRow(buf); - ch.setCell(*chars++); - } - if (!headerdone) { - headerdone = true; - if (endian == DetectEndianness) { - if (ch == QChar::ByteOrderSwapped) { - endian = LittleEndianness; - } else if (ch == QChar::ByteOrderMark) { - endian = BigEndianness; - } else { - if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { - endian = BigEndianness; - } else { - endian = LittleEndianness; - ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8)); - } - *qch++ = ch; - } - } else if (ch != QChar::ByteOrderMark) { - *qch++ = ch; - } - } else { - *qch++ = ch; - } - half = false; - } else { - buf = *chars++; - half = true; - } - } - result.truncate(qch - result.unicode()); - - if (state) { - if (headerdone) - state->flags |= QTextCodec::IgnoreHeader; - state->state_data[Endian] = endian; - if (half) { - state->remainingChars = 1; - state->state_data[Data] = buf; - } else { - state->remainingChars = 0; - state->state_data[Data] = 0; - } - } - return result; -} - -QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) -{ - DataEndianness endian = e; - int length = 4*len; - if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) { - length += 4; - } - if (e == DetectEndianness) { - endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? 
BigEndianness : LittleEndianness; - } - - QByteArray d(length, Qt::Uninitialized); - char *data = d.data(); - if (!state || !(state->flags & QTextCodec::IgnoreHeader)) { - if (endian == BigEndianness) { - data[0] = 0; - data[1] = 0; - data[2] = (char)0xfe; - data[3] = (char)0xff; - } else { - data[0] = (char)0xff; - data[1] = (char)0xfe; - data[2] = 0; - data[3] = 0; - } - data += 4; - } - - QStringIterator i(uc, uc + len); - if (endian == BigEndianness) { - while (i.hasNext()) { - uint cp = i.next(); - qToBigEndian(cp, data); - data += 4; - } - } else { - while (i.hasNext()) { - uint cp = i.next(); - qToLittleEndian(cp, data); - data += 4; - } - } - - if (state) { - state->remainingChars = 0; - state->flags |= QTextCodec::IgnoreHeader; - } - return d; -} - -QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e) -{ - DataEndianness endian = e; - uchar tuple[4]; - int num = 0; - bool headerdone = false; - if (state) { - headerdone = state->flags & QTextCodec::IgnoreHeader; - if (endian == DetectEndianness) { - endian = (DataEndianness)state->state_data[Endian]; - } - num = state->remainingChars; - memcpy(tuple, &state->state_data[Data], 4); - } - if (headerdone && endian == DetectEndianness) - endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? 
BigEndianness : LittleEndianness; - - QString result; - result.resize((num + len) >> 2 << 1); // worst case - QChar *qch = (QChar *)result.data(); - - const char *end = chars + len; - while (chars < end) { - tuple[num++] = *chars++; - if (num == 4) { - if (!headerdone) { - headerdone = true; - if (endian == DetectEndianness) { - if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) { - endian = LittleEndianness; - num = 0; - continue; - } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) { - endian = BigEndianness; - num = 0; - continue; - } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { - endian = BigEndianness; - } else { - endian = LittleEndianness; - } - } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) { - num = 0; - continue; - } - } - uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple); - for (char16_t c : QChar::fromUcs4(code)) - *qch++ = c; - num = 0; - } - } - result.truncate(qch - result.unicode()); - - if (state) { - if (headerdone) - state->flags |= QTextCodec::IgnoreHeader; - state->state_data[Endian] = endian; - state->remainingChars = num; - memcpy(&state->state_data[Data], tuple, 4); - } - return result; -} - -QString qFromUtfEncoded(const QByteArray &ba) -{ - const int arraySize = ba.size(); - const uchar *buf = reinterpret_cast<const uchar *>(ba.constData()); - const uint bom = 0xfeff; - - if (arraySize > 3) { - uint uc = qFromUnaligned<uint>(buf); - if (uc == qToBigEndian(bom) || uc == qToLittleEndian(bom)) - return QUtf32::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-32 - } - - if (arraySize > 1) { - ushort uc = qFromUnaligned<ushort>(buf); - if (uc == qToBigEndian(ushort(bom)) || qToLittleEndian(ushort(bom))) - return QUtf16::convertToUnicode(ba.constData(), ba.length(), 
nullptr); // utf-16 - } - return QUtf8::convertToUnicode(ba.constData(), ba.length()); -} - #if QT_CONFIG(textcodec) QUtf8Codec::~QUtf8Codec() diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h index b1c7a23d4f..893a6db8e1 100644 --- a/src/corelib/codecs/qutfcodec_p.h +++ b/src/corelib/codecs/qutfcodec_p.h @@ -60,271 +60,11 @@ #include "QtCore/qtextcodec.h" #endif +#include "private/qstringconverter_p.h" #include "private/qtextcodec_p.h" QT_BEGIN_NAMESPACE -struct QUtf8BaseTraits -{ - static const bool isTrusted = false; - static const bool allowNonCharacters = true; - static const bool skipAsciiHandling = false; - static const int Error = -1; - static const int EndOfString = -2; - - static bool isValidCharacter(uint u) - { return int(u) >= 0; } - - static void appendByte(uchar *&ptr, uchar b) - { *ptr++ = b; } - - static uchar peekByte(const uchar *ptr, int n = 0) - { return ptr[n]; } - - static qptrdiff availableBytes(const uchar *ptr, const uchar *end) - { return end - ptr; } - - static void advanceByte(const uchar *&ptr, int n = 1) - { ptr += n; } - - static void appendUtf16(ushort *&ptr, ushort uc) - { *ptr++ = uc; } - - static void appendUcs4(ushort *&ptr, uint uc) - { - appendUtf16(ptr, QChar::highSurrogate(uc)); - appendUtf16(ptr, QChar::lowSurrogate(uc)); - } - - static ushort peekUtf16(const ushort *ptr, int n = 0) - { return ptr[n]; } - - static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) - { return end - ptr; } - - static void advanceUtf16(const ushort *&ptr, int n = 1) - { ptr += n; } - - // it's possible to output to UCS-4 too - static void appendUtf16(uint *&ptr, ushort uc) - { *ptr++ = uc; } - - static void appendUcs4(uint *&ptr, uint uc) - { *ptr++ = uc; } -}; - -struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits -{ - static const bool skipAsciiHandling = true; -}; - -namespace QUtf8Functions -{ - /// returns 0 on success; errors can only happen if \a u is a surrogate: - /// Error if \a u is a low 
surrogate; - /// if \a u is a high surrogate, Error if the next isn't a low one, - /// EndOfString if we run into the end of the string. - template <typename Traits, typename OutputPtr, typename InputPtr> inline - int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) - { - if (!Traits::skipAsciiHandling && u < 0x80) { - // U+0000 to U+007F (US-ASCII) - one byte - Traits::appendByte(dst, uchar(u)); - return 0; - } else if (u < 0x0800) { - // U+0080 to U+07FF - two bytes - // first of two bytes - Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); - } else { - if (!QChar::isSurrogate(u)) { - // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes - if (!Traits::allowNonCharacters && QChar::isNonCharacter(u)) - return Traits::Error; - - // first of three bytes - Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); - } else { - // U+10000 to U+10FFFF - four bytes - // need to get one extra codepoint - if (Traits::availableUtf16(src, end) == 0) - return Traits::EndOfString; - - ushort low = Traits::peekUtf16(src); - if (!QChar::isHighSurrogate(u)) - return Traits::Error; - if (!QChar::isLowSurrogate(low)) - return Traits::Error; - - Traits::advanceUtf16(src); - uint ucs4 = QChar::surrogateToUcs4(u, low); - - if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) - return Traits::Error; - - // first byte - Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); - - // second of four bytes - Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); - - // for the rest of the bytes - u = ushort(ucs4); - } - - // second to last byte - Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); - } - - // last byte - Traits::appendByte(dst, 0x80 | (u & 0x3f)); - return 0; - } - - inline bool isContinuationByte(uchar b) - { - return (b & 0xc0) == 0x80; - } - - /// returns the number of characters consumed (including \a b) in case of success; - /// returns negative in case of error: Traits::Error or Traits::EndOfString - template <typename Traits, typename 
OutputPtr, typename InputPtr> inline - int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) - { - int charsNeeded; - uint min_uc; - uint uc; - - if (!Traits::skipAsciiHandling && b < 0x80) { - // US-ASCII - Traits::appendUtf16(dst, b); - return 1; - } - - if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { - // an UTF-8 first character must be at least 0xC0 - // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences - return Traits::Error; - } else if (b < 0xe0) { - charsNeeded = 2; - min_uc = 0x80; - uc = b & 0x1f; - } else if (b < 0xf0) { - charsNeeded = 3; - min_uc = 0x800; - uc = b & 0x0f; - } else if (b < 0xf5) { - charsNeeded = 4; - min_uc = 0x10000; - uc = b & 0x07; - } else { - // the last Unicode character is U+10FFFF - // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" - // therefore, a byte higher than 0xF4 is not the UTF-8 first byte - return Traits::Error; - } - - int bytesAvailable = Traits::availableBytes(src, end); - if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { - // it's possible that we have an error instead of just unfinished bytes - if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) - return Traits::Error; - if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) - return Traits::Error; - return Traits::EndOfString; - } - - // first continuation character - b = Traits::peekByte(src, 0); - if (!isContinuationByte(b)) - return Traits::Error; - uc <<= 6; - uc |= b & 0x3f; - - if (charsNeeded > 2) { - // second continuation character - b = Traits::peekByte(src, 1); - if (!isContinuationByte(b)) - return Traits::Error; - uc <<= 6; - uc |= b & 0x3f; - - if (charsNeeded > 3) { - // third continuation character - b = Traits::peekByte(src, 2); - if (!isContinuationByte(b)) - return Traits::Error; - uc <<= 6; - uc |= b & 0x3f; - } - } - - // we've decoded something; safety-check it - if (!Traits::isTrusted) { - if (uc < min_uc) - return Traits::Error; - if (QChar::isSurrogate(uc) || 
uc > QChar::LastValidCodePoint) - return Traits::Error; - if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) - return Traits::Error; - } - - // write the UTF-16 sequence - if (!QChar::requiresSurrogates(uc)) { - // UTF-8 decoded and no surrogates are required - // detach if necessary - Traits::appendUtf16(dst, ushort(uc)); - } else { - // UTF-8 decoded to something that requires a surrogate pair - Traits::appendUcs4(dst, uc); - } - - Traits::advanceByte(src, charsNeeded - 1); - return charsNeeded; - } -} - -enum DataEndianness -{ - DetectEndianness, - BigEndianness, - LittleEndianness -}; - -struct QUtf8 -{ - static QChar *convertToUnicode(QChar *, const char *, int) noexcept; - static QString convertToUnicode(const char *, int); - static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); - static QByteArray convertFromUnicode(const QChar *, int); - static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); - struct ValidUtf8Result { - bool isValidUtf8; - bool isValidAscii; - }; - static ValidUtf8Result isValidUtf8(const char *, qsizetype); - static int compareUtf8(const char *, qsizetype, const QChar *, int); - static int compareUtf8(const char *, qsizetype, QLatin1String s); -}; - -struct QUtf16 -{ - static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); - static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); -}; - -struct QUtf32 -{ - static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); - static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); -}; - -/* - Converts from different utf encodings looking at a possible byte order mark at the - beginning of the string. If no BOM exists, utf-8 is assumed. 
- */ -QString Q_CORE_EXPORT qFromUtfEncoded(const QByteArray &ba); - - #if QT_CONFIG(textcodec) class QUtf8Codec : public QTextCodec { diff --git a/src/corelib/global/qconfig-bootstrapped.h b/src/corelib/global/qconfig-bootstrapped.h index 349dfeea1c..6ef4acf503 100644 --- a/src/corelib/global/qconfig-bootstrapped.h +++ b/src/corelib/global/qconfig-bootstrapped.h @@ -141,18 +141,17 @@ #define QT_FEATURE_zstd -1 #endif +#define QT_FEATURE_textcodec -1 + #ifdef QT_BUILD_QMAKE #define QT_FEATURE_commandlineparser -1 #define QT_NO_COMPRESS #define QT_JSON_READONLY #define QT_FEATURE_settings 1 #define QT_NO_STANDARDPATHS -#define QT_FEATURE_textcodec -1 #else -#define QT_FEATURE_codecs -1 #define QT_FEATURE_commandlineparser 1 #define QT_FEATURE_settings -1 -#define QT_FEATURE_textcodec 1 #endif #endif // QT_BOOTSTRAPPED diff --git a/src/corelib/io/qfilesystemiterator_unix.cpp b/src/corelib/io/qfilesystemiterator_unix.cpp index ceea3a467c..4bc6b2e31b 100644 --- a/src/corelib/io/qfilesystemiterator_unix.cpp +++ b/src/corelib/io/qfilesystemiterator_unix.cpp @@ -42,7 +42,7 @@ #if QT_CONFIG(textcodec) # include <qtextcodec.h> -# include <private/qutfcodec_p.h> +# include <private/qstringconverter_p.h> #endif #ifndef QT_NO_FILESYSTEMITERATOR diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp index 1c9d0d1d4b..2788de3b3a 100644 --- a/src/corelib/io/qurlrecode.cpp +++ b/src/corelib/io/qurlrecode.cpp @@ -38,7 +38,7 @@ ****************************************************************************/ #include "qurl.h" -#include "private/qutfcodec_p.h" +#include "private/qstringconverter_p.h" #include "private/qtools_p.h" #include "private/qsimd_p.h" diff --git a/src/corelib/serialization/qcborstreamreader.cpp b/src/corelib/serialization/qcborstreamreader.cpp index ec385e0629..c49a76aada 100644 --- a/src/corelib/serialization/qcborstreamreader.cpp +++ b/src/corelib/serialization/qcborstreamreader.cpp @@ -44,7 +44,7 @@ #include <private/qbytearray_p.h> #include 
<private/qnumeric_p.h> -#include <private/qutfcodec_p.h> +#include <private/qstringconverter_p.h> #include <qdebug.h> #include <qstack.h> diff --git a/src/corelib/serialization/qcborvalue_p.h b/src/corelib/serialization/qcborvalue_p.h index 1d686f118b..38383c7522 100644 --- a/src/corelib/serialization/qcborvalue_p.h +++ b/src/corelib/serialization/qcborvalue_p.h @@ -54,7 +54,7 @@ #include "qcborvalue.h" #include <private/qglobal_p.h> -#include <private/qutfcodec_p.h> +#include <private/qstringconverter_p.h> #include <math.h> diff --git a/src/corelib/serialization/qjsonparser.cpp b/src/corelib/serialization/qjsonparser.cpp index 46d82ea47f..116e7f6995 100644 --- a/src/corelib/serialization/qjsonparser.cpp +++ b/src/corelib/serialization/qjsonparser.cpp @@ -44,7 +44,7 @@ #include <qdebug.h> #include "qjsonparser_p.h" #include "qjson_p.h" -#include "private/qutfcodec_p.h" +#include "private/qstringconverter_p.h" #include "private/qcborvalue_p.h" #include "private/qnumeric_p.h" diff --git a/src/corelib/serialization/qjsonwriter.cpp b/src/corelib/serialization/qjsonwriter.cpp index 590b59f09c..8610cdff7e 100644 --- a/src/corelib/serialization/qjsonwriter.cpp +++ b/src/corelib/serialization/qjsonwriter.cpp @@ -42,7 +42,7 @@ #include <qlocale.h> #include "qjsonwriter_p.h" #include "qjson_p.h" -#include "private/qutfcodec_p.h" +#include "private/qstringconverter_p.h" #include <private/qnumeric_p.h> #include <private/qcborvalue_p.h> diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 0682395ebf..68a0f757c8 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -48,7 +48,7 @@ #if QT_CONFIG(textcodec) #include <qtextcodec.h> #endif -#include <private/qutfcodec_p.h> +#include <private/qstringconverter_p.h> #include "qlocale_tools_p.h" #include "private/qsimd_p.h" #include <qnumeric.h> diff --git a/src/corelib/text/qstringbuilder.cpp b/src/corelib/text/qstringbuilder.cpp index 29bd216e80..4e47ba0922 100644 --- 
a/src/corelib/text/qstringbuilder.cpp +++ b/src/corelib/text/qstringbuilder.cpp @@ -38,7 +38,7 @@ ****************************************************************************/ #include "qstringbuilder.h" -#include <private/qutfcodec_p.h> +#include <private/qstringconverter_p.h> QT_BEGIN_NAMESPACE diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 1f61eee5cb..36567f5106 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -39,10 +39,954 @@ ****************************************************************************/ #include <qstringconverter.h> -#include <private/qutfcodec_p.h> +#include <private/qstringconverter_p.h> +#include "qendian.h" + +#include "private/qsimd_p.h" +#include "private/qstringiterator_p.h" QT_BEGIN_NAMESPACE +enum { Endian = 0, Data = 1 }; + +static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf }; + +#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \ + || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64)) +static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept +{ + uint result = qCountLeadingZeroBits(v); + // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31 + // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when + // counting up: msb index is 0 (because it starts there), and the lsb index is 31. 
+ result ^= sizeof(unsigned) * 8 - 1; + return result; +} +#endif + +#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) +static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) +{ + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16, dst += 16) { +# ifdef __AVX2__ + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); + __m128i data1 = _mm256_castsi256_si128(data); + __m128i data2 = _mm256_extracti128_si256(data, 1); +# else + __m128i data1 = _mm_loadu_si128((const __m128i*)src); + __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src); +# endif + + // check if everything is ASCII + // the highest ASCII value is U+007F + // Do the packing directly: + // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit + // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff, + // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII, + // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as + // "non-ASCII", but it's an acceptable compromise. 
+ __m128i packed = _mm_packus_epi16(data1, data2); + __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); + + // store, even if there are non-ASCII characters here + _mm_storeu_si128((__m128i*)dst, packed); + + // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL) + ushort n = ~_mm_movemask_epi8(nonAscii); + if (n) { + // find the next probable ASCII character + // we don't want to load 32 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(n) + 1; + + n = qCountTrailingZeroBits(n); + dst += n; + src += n; + return false; + } + } + + if (end - src >= 8) { + // do eight characters at a time + __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)); + __m128i packed = _mm_packus_epi16(data, data); + __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); + + // store even non-ASCII + _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed); + + uchar n = ~_mm_movemask_epi8(nonAscii); + if (n) { + nextAscii = src + qBitScanReverse(n) + 1; + n = qCountTrailingZeroBits(n); + dst += n; + src += n; + return false; + } + } + + return src == end; +} + +static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) +{ + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16, dst += 16) { + __m128i data = _mm_loadu_si128((const __m128i*)src); + +#ifdef __AVX2__ + const int BitSpacing = 2; + // load and zero extend to an YMM register + const __m256i extended = _mm256_cvtepu8_epi16(data); + + uint n = _mm256_movemask_epi8(extended); + if (!n) { + // store + _mm256_storeu_si256((__m256i*)dst, extended); + continue; + } +#else + const int BitSpacing = 1; + + // check if everything is ASCII + // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII + uint n = _mm_movemask_epi8(data); + if (!n) { + // unpack + _mm_storeu_si128((__m128i*)dst, 
_mm_unpacklo_epi8(data, _mm_setzero_si128())); + _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); + continue; + } +#endif + + // copy the front part that is still ASCII + while (!(n & 1)) { + *dst++ = *src++; + n >>= BitSpacing; + } + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + n = qBitScanReverse(n); + nextAscii = src + (n / BitSpacing) + 1; + return false; + + } + + if (end - src >= 8) { + __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src)); + uint n = _mm_movemask_epi8(data) & 0xff; + if (!n) { + // unpack and store + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128())); + } else { + while (!(n & 1)) { + *dst++ = *src++; + n >>= 1; + } + + n = qBitScanReverse(n); + nextAscii = src + n + 1; + return false; + } + } + + return src == end; +} + +static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) +{ +#ifdef __AVX2__ + // do 32 characters at a time + // (this is similar to simdTestMask in qstring.cpp) + const __m256i mask = _mm256_set1_epi8(0x80); + for ( ; end - src >= 32; src += 32) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); + if (_mm256_testz_si256(mask, data)) + continue; + + uint n = _mm256_movemask_epi8(data); + Q_ASSUME(n); + + // find the next probable ASCII character + // we don't want to load 32 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(n) + 1; + + // return the non-ASCII character + return src + qCountTrailingZeroBits(n); + } +#endif + + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16) { + __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); + + // check if everything is ASCII + // movemask extracts the high bit of every byte, so n is 
non-zero if something isn't ASCII + uint n = _mm_movemask_epi8(data); + if (!n) + continue; + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(n) + 1; + + // return the non-ASCII character + return src + qCountTrailingZeroBits(n); + } + + // do four characters at a time + for ( ; end - src >= 4; src += 4) { + quint32 data = qFromUnaligned<quint32>(src); + data &= 0x80808080U; + if (!data) + continue; + + // We don't try to guess which of the three bytes is ASCII and which + // one isn't. The chance that at least two of them are non-ASCII is + // better than 75%. + nextAscii = src; + return src; + } + nextAscii = end; + return src; +} +#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64 +static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) +{ + uint16x8_t maxAscii = vdupq_n_u16(0x7f); + uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; + uint16x8_t mask2 = vshlq_n_u16(mask1, 1); + + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16, dst += 16) { + // load 2 lanes (or: "load interleaved") + uint16x8x2_t in = vld2q_u16(src); + + // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc), + // add those together into a scalar, and merge the scalars. 
+ uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1)) + | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2)); + + // merge the two lanes by shifting the values of the second by 8 and inserting them + uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8); + + // store, even if there are non-ASCII characters here + vst1q_u8(dst, vreinterpretq_u8_u16(out)); + + if (nonAscii) { + // find the next probable ASCII character + // we don't want to load 32 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(nonAscii) + 1; + + nonAscii = qCountTrailingZeroBits(nonAscii); + dst += nonAscii; + src += nonAscii; + return false; + } + } + return src == end; +} + +static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) +{ + // do eight characters at a time + uint8x8_t msb_mask = vdup_n_u8(0x80); + uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + for ( ; end - src >= 8; src += 8, dst += 8) { + uint8x8_t c = vld1_u8(src); + uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); + if (!n) { + // store + vst1q_u16(dst, vmovl_u8(c)); + continue; + } + + // copy the front part that is still ASCII + while (!(n & 1)) { + *dst++ = *src++; + n >>= 1; + } + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + n = qBitScanReverse(n); + nextAscii = src + n + 1; + return false; + + } + return src == end; +} + +static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) +{ + // The SIMD code below is untested, so just force an early return until + // we've had the time to verify it works. 
+ nextAscii = end; + return src; + + // do eight characters at a time + uint8x8_t msb_mask = vdup_n_u8(0x80); + uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + for ( ; end - src >= 8; src += 8) { + uint8x8_t c = vld1_u8(src); + uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); + if (!n) + continue; + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(n) + 1; + + // return the non-ASCII character + return src + qCountTrailingZeroBits(n); + } + nextAscii = end; + return src; +} +#else +static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *) +{ + return false; +} + +static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *) +{ + return false; +} + +static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) +{ + nextAscii = end; + return src; +} +#endif + +QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len) +{ + // create a QByteArray with the worst case scenario size + QByteArray result(len * 3, Qt::Uninitialized); + uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData())); + const ushort *src = reinterpret_cast<const ushort *>(uc); + const ushort *const end = src + len; + + while (src != end) { + const ushort *nextAscii = end; + if (simdEncodeAscii(dst, nextAscii, src, end)) + break; + + do { + ushort uc = *src++; + int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end); + if (res < 0) { + // encoding error - append '?' 
+ *dst++ = '?'; + } + } while (src < nextAscii); + } + + result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); + return result; +} + +QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state) +{ + uchar replacement = '?'; + qsizetype rlen = 3*len; + int surrogate_high = -1; + if (state) { + if (state->flags & QStringConverter::ConvertInvalidToNull) + replacement = 0; + if (!(state->flags & QStringConverter::IgnoreHeader)) + rlen += 3; + if (state->remainingChars) + surrogate_high = state->state_data[0]; + } + + + QByteArray rstr(rlen, Qt::Uninitialized); + uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData())); + const ushort *src = reinterpret_cast<const ushort *>(uc); + const ushort *const end = src + len; + + int invalid = 0; + if (state && !(state->flags & QStringConverter::IgnoreHeader)) { + // append UTF-8 BOM + *cursor++ = utf8bom[0]; + *cursor++ = utf8bom[1]; + *cursor++ = utf8bom[2]; + } + + const ushort *nextAscii = src; + while (src != end) { + int res; + ushort uc; + if (surrogate_high != -1) { + uc = surrogate_high; + surrogate_high = -1; + res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); + } else { + if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end)) + break; + + uc = *src++; + res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); + } + if (Q_LIKELY(res >= 0)) + continue; + + if (res == QUtf8BaseTraits::Error) { + // encoding error + ++invalid; + *cursor++ = replacement; + } else if (res == QUtf8BaseTraits::EndOfString) { + surrogate_high = uc; + break; + } + } + + rstr.resize(cursor - (const uchar*)rstr.constData()); + if (state) { + state->invalidChars += invalid; + state->flags |= QStringConverter::IgnoreHeader; + state->remainingChars = 0; + if (surrogate_high >= 0) { + state->remainingChars = 1; + state->state_data[0] = surrogate_high; + } + } + return rstr; +} + +QString 
QUtf8::convertToUnicode(const char *chars, qsizetype len) +{ + // UTF-8 to UTF-16 always needs the exact same number of words or less: + // UTF-8 UTF-16 + // 1 byte 1 word + // 2 bytes 1 word + // 3 bytes 1 word + // 4 bytes 2 words (one surrogate pair) + // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8), + // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or + // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK). + // + // The table holds for invalid sequences too: we'll insert one replacement char + // per invalid byte. + QString result(len, Qt::Uninitialized); + QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared + const QChar *end = convertToUnicode(data, chars, len); + result.truncate(end - data); + return result; +} + +/*! + \since 5.7 + \overload + + Converts the UTF-8 sequence of \a len octets beginning at \a chars to + a sequence of QChar starting at \a buffer. The buffer is expected to be + large enough to hold the result. An upper bound for the size of the + buffer is \a len QChars. + + If, during decoding, an error occurs, a QChar::ReplacementCharacter is + written. + + Returns a pointer to one past the last QChar written. + + This function never throws. 
+*/ + +QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, qsizetype len) noexcept +{ + ushort *dst = reinterpret_cast<ushort *>(buffer); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + + // attempt to do a full decoding in SIMD + const uchar *nextAscii = end; + if (!simdDecodeAscii(dst, nextAscii, src, end)) { + // at least one non-ASCII entry + // check if we failed to decode the UTF-8 BOM; if so, skip it + if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars)) + && end - src >= 3 + && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) { + src += 3; + } + + while (src < end) { + nextAscii = end; + if (simdDecodeAscii(dst, nextAscii, src, end)) + break; + + do { + uchar b = *src++; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); + if (res < 0) { + // decoding error + *dst++ = QChar::ReplacementCharacter; + } + } while (src < nextAscii); + } + } + + return reinterpret_cast<QChar *>(dst); +} + +QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state) +{ + bool headerdone = false; + ushort replacement = QChar::ReplacementCharacter; + int invalid = 0; + int res; + uchar ch = 0; + + // See above for buffer requirements for stateless decoding. However, that + // fails if the state is not empty. 
The following situations can add to the + // requirements: + // state contains chars starts with requirement + // 1 of 2 bytes valid continuation 0 + // 2 of 3 bytes same 0 + // 3 bytes of 4 same +1 (need to insert surrogate pair) + // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart) + // 2 of 3 bytes same +1 (same) + // 3 of 4 bytes same +1 (same) + QString result(len + 1, Qt::Uninitialized); + + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + + if (state) { + if (state->flags & QStringConverter::IgnoreHeader) + headerdone = true; + if (state->flags & QStringConverter::ConvertInvalidToNull) + replacement = QChar::Null; + if (state->remainingChars) { + // handle incoming state first + uchar remainingCharsData[4]; // longest UTF-8 sequence possible + qsizetype remainingCharsCount = state->remainingChars; + qsizetype newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); + + memset(remainingCharsData, 0, sizeof(remainingCharsData)); + memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); + memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); + + const uchar *begin = &remainingCharsData[1]; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, + static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); + if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) { + // special case for len == 0: + // if we were supplied an empty string, terminate the previous, unfinished sequence with error + ++invalid; + *dst++ = replacement; + } else if (res == QUtf8BaseTraits::EndOfString) { + // if we got EndOfString again, then there were too few bytes in src; + // copy to our state and return + state->remainingChars = remainingCharsCount + newCharsToCopy; + 
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); + return QString(); + } else if (!headerdone && res >= 0) { + // eat the UTF-8 BOM + headerdone = true; + if (dst[-1] == 0xfeff) + --dst; + } + + // adjust src now that we have maybe consumed a few chars + if (res >= 0) { + Q_ASSERT(res > remainingCharsCount); + src += res - remainingCharsCount; + } + } + } + + // main body, stateless decoding + res = 0; + const uchar *nextAscii = src; + const uchar *start = src; + while (res >= 0 && src < end) { + if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) + break; + + ch = *src++; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end); + if (!headerdone && res >= 0) { + headerdone = true; + if (src == start + 3) { // 3 == sizeof(utf8-bom) + // eat the UTF-8 BOM (it can only appear at the beginning of the string). + if (dst[-1] == 0xfeff) + --dst; + } + } + if (res == QUtf8BaseTraits::Error) { + res = 0; + ++invalid; + *dst++ = replacement; + } + } + + if (!state && res == QUtf8BaseTraits::EndOfString) { + // unterminated UTF sequence + *dst++ = QChar::ReplacementCharacter; + while (src++ < end) + *dst++ = QChar::ReplacementCharacter; + } + + result.truncate(dst - (const ushort *)result.unicode()); + if (state) { + state->invalidChars += invalid; + if (headerdone) + state->flags |= QStringConverter::IgnoreHeader; + if (res == QUtf8BaseTraits::EndOfString) { + --src; // unread the byte in ch + state->remainingChars = end - src; + memcpy(&state->state_data[0], src, end - src); + } else { + state->remainingChars = 0; + } + } + return result; +} + +/* Traits for QUtf8Functions::fromUtf8 whose output sink is the empty NoOutput type: both append functions are no-ops, so decoding runs without storing anything. */ struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii +{ + struct NoOutput {}; + static void appendUtf16(const NoOutput &, ushort) {} + static void appendUcs4(const NoOutput &, uint) {} +}; + +/* Checks the len bytes at chars. Returns { false, false } as soon as fromUtf8 reports a negative result for a sequence; otherwise returns { true, isValidAscii }, where isValidAscii stays true only if no byte with the high bit set was seen. */ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len) +{ + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + const uchar
*nextAscii = src; + bool isValidAscii = true; + + while (src < end) { + if (src >= nextAscii) + src = simdFindNonAscii(src, end, nextAscii); /* NOTE(review): simdFindNonAscii is defined outside this view; presumably it skips an ASCII-only prefix and updates nextAscii - confirm */ + if (src == end) + break; + + do { + uchar b = *src++; + if ((b & 0x80) == 0) + continue; /* plain ASCII byte, nothing to decode */ + + isValidAscii = false; + QUtf8NoOutputTraits::NoOutput output; + int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end); + if (res < 0) { + // decoding error + return { false, false }; + } + } while (src < nextAscii); + } + + return { true, isValidAscii }; +} + +/* Three-way comparison of u8len bytes of UTF-8 against u16len UTF-16 units. Each iteration decodes one UTF-8 sequence into uc1 (U+FFFD when fromUtf8 fails) and compares it with the next value from the QStringIterator; the first mismatch returns int(uc1) - int(uc2). If one input runs out first, the result is -1, 0 or 1 so that the shorter input sorts first. */ int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, qsizetype u16len) +{ + uint uc1, uc2; + auto src1 = reinterpret_cast<const uchar *>(utf8); + auto end1 = src1 + u8len; + QStringIterator src2(utf16, utf16 + u16len); + + while (src1 < end1 && src2.hasNext()) { + uchar b = *src1++; + uint *output = &uc1; /* fromUtf8 advances its output pointer, so hand it a fresh pointer to uc1 each time */ + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + uc2 = src2.next(); + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - int(src2.hasNext()); +} + +/* Same comparison against a QLatin1String: each byte of s is used directly as the value compared with the decoded UTF-8 character. */ int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s) +{ + uint uc1; + auto src1 = reinterpret_cast<const uchar *>(utf8); + auto end1 = src1 + u8len; + auto src2 = reinterpret_cast<const uchar *>(s.latin1()); + auto end2 = src2 + s.size(); + + while (src1 < end1 && src2 < end2) { + uchar b = *src1++; + uint *output = &uc1; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + uint uc2 = *src2++; + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - (end2 > src2); +} + +/* Encodes len UTF-16 units from uc as a byte array in byte order e, preceded by a byte order mark unless state carries the IgnoreHeader flag. */ QByteArray QUtf16::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e) +{ + DataEndianness endian = e; + qsizetype
length = 2*len; /* two bytes per UTF-16 unit */ + if (!state || (!(state->flags & QStringConverter::IgnoreHeader))) { + length += 2; /* room for the 2-byte byte order mark */ + } + if (e == DetectEndianness) { /* no explicit byte order requested: use the host's */ + endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; + } + + QByteArray d; + d.resize(length); + char *data = d.data(); + if (!state || !(state->flags & QStringConverter::IgnoreHeader)) { /* same condition as the size computation above: emit the BOM first */ + QChar bom(QChar::ByteOrderMark); + if (endian == BigEndianness) + qToBigEndian(bom.unicode(), data); + else + qToLittleEndian(bom.unicode(), data); + data += 2; + } + if (endian == BigEndianness) + qToBigEndian<ushort>(uc, len, data); + else + qToLittleEndian<ushort>(uc, len, data); + + if (state) { + state->remainingChars = 0; + state->flags |= QStringConverter::IgnoreHeader; /* later calls with this state will not write a BOM */ + } + return d; +} + +/* Decodes len bytes of UTF-16 from chars into a QString. When state is given, the header flag, the byte order (state_data[Endian]) and a pending odd byte (state_data[Data], signalled by remainingChars) are restored from it here. */ QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e) +{ + DataEndianness endian = e; + bool half = false; /* true while buf holds the first byte of an incomplete 2-byte unit */ + uchar buf = 0; + bool headerdone = false; + if (state) { + headerdone = state->flags & QStringConverter::IgnoreHeader; + if (endian == DetectEndianness) + endian = (DataEndianness)state->state_data[Endian]; + if (state->remainingChars) { + half = true; + buf = state->state_data[Data]; + } + } + if (headerdone && endian == DetectEndianness) /* no BOM will be examined and no order is known: fall back to the host byte order */ + endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ?
BigEndianness : LittleEndianness; + + QString result(len, Qt::Uninitialized); // worst case + QChar *qch = (QChar *)result.data(); + while (len--) { + if (half) { + QChar ch; + if (endian == LittleEndianness) { + ch.setRow(*chars++); + ch.setCell(buf); + } else { + ch.setRow(buf); + ch.setCell(*chars++); + } + if (!headerdone) { + headerdone = true; + if (endian == DetectEndianness) { + if (ch == QChar::ByteOrderSwapped) { + endian = LittleEndianness; + } else if (ch == QChar::ByteOrderMark) { + endian = BigEndianness; + } else { + if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { + endian = BigEndianness; + } else { + endian = LittleEndianness; + ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8)); + } + *qch++ = ch; + } + } else if (ch != QChar::ByteOrderMark) { + *qch++ = ch; + } + } else { + *qch++ = ch; + } + half = false; + } else { + buf = *chars++; + half = true; + } + } + result.truncate(qch - result.unicode()); + + if (state) { + if (headerdone) + state->flags |= QStringConverter::IgnoreHeader; + state->state_data[Endian] = endian; + if (half) { + state->remainingChars = 1; + state->state_data[Data] = buf; + } else { + state->remainingChars = 0; + state->state_data[Data] = 0; + } + } + return result; +} + +/* Encodes len UTF-16 units from uc as UTF-32 bytes in byte order e (host order for DetectEndianness), preceded by a 4-byte byte order mark unless state carries the IgnoreHeader flag. The returned array holds exactly the bytes written. When state is given, its remainingChars is reset and IgnoreHeader is set. */ QByteArray QUtf32::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e) +{ + DataEndianness endian = e; + qsizetype length = 4*len; /* worst case: one 4-byte value per UTF-16 unit */ + if (!state || (!(state->flags & QStringConverter::IgnoreHeader))) { + length += 4; + } + if (e == DetectEndianness) { + endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ?
BigEndianness : LittleEndianness; + } + + QByteArray d(length, Qt::Uninitialized); + char *data = d.data(); + if (!state || !(state->flags & QStringConverter::IgnoreHeader)) { + if (endian == BigEndianness) { + data[0] = 0; + data[1] = 0; + data[2] = (char)0xfe; + data[3] = (char)0xff; + } else { + data[0] = (char)0xff; + data[1] = (char)0xfe; + data[2] = 0; + data[3] = 0; + } + data += 4; + } + + QStringIterator i(uc, uc + len); + if (endian == BigEndianness) { + while (i.hasNext()) { + uint cp = i.next(); + qToBigEndian(cp, data); + data += 4; + } + } else { + while (i.hasNext()) { + uint cp = i.next(); + qToLittleEndian(cp, data); + data += 4; + } + } + + d.truncate(data - d.constData()); /* the loops write 4 bytes per i.next() value, which is fewer than len values when the input contains surrogate pairs; drop the unwritten, uninitialized tail of the worst-case buffer */ + + if (state) { + state->remainingChars = 0; + state->flags |= QStringConverter::IgnoreHeader; + } + return d; +} + +/* Decodes len bytes of UTF-32 from chars into a QString. When state is given, the header flag, the byte order (state_data[Endian]) and up to four pending bytes (state_data[Data], count in remainingChars) are restored from it here. */ QString QUtf32::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e) +{ + DataEndianness endian = e; + uchar tuple[4]; + int num = 0; + bool headerdone = false; + if (state) { + headerdone = state->flags & QStringConverter::IgnoreHeader; + if (endian == DetectEndianness) { + endian = (DataEndianness)state->state_data[Endian]; + } + num = state->remainingChars; + memcpy(tuple, &state->state_data[Data], 4); + } + if (headerdone && endian == DetectEndianness) + endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ?
BigEndianness : LittleEndianness; + + QString result; + result.resize((num + len) >> 2 << 1); // worst case + QChar *qch = (QChar *)result.data(); + + const char *end = chars + len; + while (chars < end) { + tuple[num++] = *chars++; + if (num == 4) { + if (!headerdone) { + headerdone = true; + if (endian == DetectEndianness) { + if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) { + endian = LittleEndianness; + num = 0; + continue; + } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) { + endian = BigEndianness; + num = 0; + continue; + } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { + endian = BigEndianness; + } else { + endian = LittleEndianness; + } + } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) { + num = 0; + continue; + } + } + uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple); + for (char16_t c : QChar::fromUcs4(code)) + *qch++ = c; + num = 0; + } + } + result.truncate(qch - result.unicode()); + + if (state) { + if (headerdone) + state->flags |= QStringConverter::IgnoreHeader; + state->state_data[Endian] = endian; + state->remainingChars = num; + memcpy(&state->state_data[Data], tuple, 4); + } + return result; +} + +/* Decodes ba as UTF-32 if it starts with a 4-byte byte order mark, as UTF-16 if it starts with a 2-byte byte order mark, and as UTF-8 otherwise. The UTF-32 test must come first because the little-endian UTF-32 mark (FF FE 00 00) begins with the little-endian UTF-16 mark. */ QString qFromUtfEncoded(const QByteArray &ba) +{ + const qsizetype arraySize = ba.size(); + const uchar *buf = reinterpret_cast<const uchar *>(ba.constData()); + const uint bom = 0xfeff; + + if (arraySize > 3) { + uint uc = qFromUnaligned<uint>(buf); + if (uc == qToBigEndian(bom) || uc == qToLittleEndian(bom)) + return QUtf32::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-32 + } + + if (arraySize > 1) { + ushort uc = qFromUnaligned<ushort>(buf); + if (uc == qToBigEndian(ushort(bom)) || uc == qToLittleEndian(ushort(bom))) /* both operands compare against uc: a bare qToLittleEndian(...) is a non-zero constant and would route every input of two or more bytes to UTF-16 */ + return QUtf16::convertToUnicode(ba.constData(),
ba.length(), nullptr); // utf-16 + } + return QUtf8::convertToUnicode(ba.constData(), ba.length()); +} + /*! \enum QStringConverter::Flag @@ -60,7 +1004,8 @@ void QStringConverter::State::clear() { if (clearFn) clearFn(this); - state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0; + else + state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0; remainingChars = 0; invalidChars = 0; } diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h new file mode 100644 index 0000000000..5764979542 --- /dev/null +++ b/src/corelib/text/qstringconverter_p.h @@ -0,0 +1,323 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Copyright (C) 2018 Intel Corporation. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QSTRINGCONVERTER_P_H +#define QSTRINGCONVERTER_P_H + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. 
+// + +#include <QtCore/qstring.h> +#include <QtCore/qendian.h> +#include <QtCore/qstringconverter.h> + +QT_BEGIN_NAMESPACE + +/* Default policy ("traits") type for the QUtf8Functions templates: compile-time flags, the two negative result codes, and the byte / UTF-16 / UCS-4 pointer primitives that toUtf8 and fromUtf8 call. */ struct QUtf8BaseTraits +{ + static const bool isTrusted = false; /* false: fromUtf8 keeps its lead-byte and post-decode validity checks */ + static const bool allowNonCharacters = true; /* true: the QChar::isNonCharacter rejections are skipped */ + static const bool skipAsciiHandling = false; /* false: values below 0x80 take the one-byte fast path */ + static const int Error = -1; + static const int EndOfString = -2; + + static bool isValidCharacter(uint u) + { return int(u) >= 0; } + + static void appendByte(uchar *&ptr, uchar b) + { *ptr++ = b; } + + static uchar peekByte(const uchar *ptr, int n = 0) + { return ptr[n]; } + + static qptrdiff availableBytes(const uchar *ptr, const uchar *end) + { return end - ptr; } + + static void advanceByte(const uchar *&ptr, int n = 1) + { ptr += n; } + + static void appendUtf16(ushort *&ptr, ushort uc) + { *ptr++ = uc; } + + /* UTF-16 output: a UCS-4 value is written as a high/low surrogate pair */ static void appendUcs4(ushort *&ptr, uint uc) + { + appendUtf16(ptr, QChar::highSurrogate(uc)); + appendUtf16(ptr, QChar::lowSurrogate(uc)); + } + + static ushort peekUtf16(const ushort *ptr, int n = 0) + { return ptr[n]; } + + static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) + { return end - ptr; } + + static void advanceUtf16(const ushort *&ptr, int n = 1) + { ptr += n; } + + // it's possible to output to UCS-4 too + static void appendUtf16(uint *&ptr, ushort uc) + { *ptr++ = uc; } + + static void appendUcs4(uint *&ptr, uint uc) + { *ptr++ = uc; } +}; + +/* Same traits with skipAsciiHandling set, so toUtf8/fromUtf8 do not take their one-byte ASCII branch. */ struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits +{ + static const bool skipAsciiHandling = true; +}; + +namespace QUtf8Functions +{ + /// returns 0 on success; errors can only happen if \a u is a surrogate: + /// Error if \a u is a low surrogate; + /// if \a u is a high surrogate, Error if the next isn't a low one, + /// EndOfString if we run into the end of the string.
+ template <typename Traits, typename OutputPtr, typename InputPtr> inline + int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) + { + if (!Traits::skipAsciiHandling && u < 0x80) { + // U+0000 to U+007F (US-ASCII) - one byte + Traits::appendByte(dst, uchar(u)); + return 0; + } else if (u < 0x0800) { + // U+0080 to U+07FF - two bytes + // first of two bytes + Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); + } else { + if (!QChar::isSurrogate(u)) { + // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes + if (!Traits::allowNonCharacters && QChar::isNonCharacter(u)) + return Traits::Error; + + // first of three bytes + Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); + } else { + // U+10000 to U+10FFFF - four bytes + // need to get one extra codepoint + if (Traits::availableUtf16(src, end) == 0) + return Traits::EndOfString; + + ushort low = Traits::peekUtf16(src); + if (!QChar::isHighSurrogate(u)) + return Traits::Error; + if (!QChar::isLowSurrogate(low)) + return Traits::Error; + + Traits::advanceUtf16(src); + uint ucs4 = QChar::surrogateToUcs4(u, low); + + if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) + return Traits::Error; + + // first byte + Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); + + // second of four bytes + Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); + + // for the rest of the bytes + u = ushort(ucs4); + } + + // second to last byte + Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); + } + + // last byte + Traits::appendByte(dst, 0x80 | (u & 0x3f)); + return 0; + } + + inline bool isContinuationByte(uchar b) + { + return (b & 0xc0) == 0x80; + } + + /// returns the number of characters consumed (including \a b) in case of success; + /// returns negative in case of error: Traits::Error or Traits::EndOfString + template <typename Traits, typename OutputPtr, typename InputPtr> inline + int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) + { + int charsNeeded; + uint min_uc; 
+ uint uc; + + if (!Traits::skipAsciiHandling && b < 0x80) { + // US-ASCII + Traits::appendUtf16(dst, b); + return 1; + } + + if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { + // an UTF-8 first character must be at least 0xC0 + // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences + return Traits::Error; + } else if (b < 0xe0) { + charsNeeded = 2; + min_uc = 0x80; + uc = b & 0x1f; + } else if (b < 0xf0) { + charsNeeded = 3; + min_uc = 0x800; + uc = b & 0x0f; + } else if (b < 0xf5) { + charsNeeded = 4; + min_uc = 0x10000; + uc = b & 0x07; + } else { + // the last Unicode character is U+10FFFF + // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" + // therefore, a byte higher than 0xF4 is not the UTF-8 first byte + return Traits::Error; + } + + int bytesAvailable = Traits::availableBytes(src, end); + if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { + // it's possible that we have an error instead of just unfinished bytes + if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) + return Traits::Error; + if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) + return Traits::Error; + return Traits::EndOfString; + } + + // first continuation character + b = Traits::peekByte(src, 0); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + + if (charsNeeded > 2) { + // second continuation character + b = Traits::peekByte(src, 1); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + + if (charsNeeded > 3) { + // third continuation character + b = Traits::peekByte(src, 2); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + } + } + + // we've decoded something; safety-check it + if (!Traits::isTrusted) { + if (uc < min_uc) + return Traits::Error; + if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) + return Traits::Error; + if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) + return Traits::Error; 
+ } + + // write the UTF-16 sequence + if (!QChar::requiresSurrogates(uc)) { + // UTF-8 decoded and no surrogates are required + // detach if necessary + Traits::appendUtf16(dst, ushort(uc)); + } else { + // UTF-8 decoded to something that requires a surrogate pair + Traits::appendUcs4(dst, uc); + } + + Traits::advanceByte(src, charsNeeded - 1); + return charsNeeded; + } +} + +enum DataEndianness +{ + DetectEndianness, + BigEndianness, + LittleEndianness +}; + +struct QUtf8 +{ + static QChar *convertToUnicode(QChar *, const char *, qsizetype) noexcept; + static QString convertToUnicode(const char *, qsizetype); + static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *); + static QByteArray convertFromUnicode(const QChar *, qsizetype); + static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *); + struct ValidUtf8Result { + bool isValidUtf8; + bool isValidAscii; + }; + static ValidUtf8Result isValidUtf8(const char *, qsizetype); + static int compareUtf8(const char *, qsizetype, const QChar *, qsizetype); + static int compareUtf8(const char *, qsizetype, QLatin1String s); +}; + +struct QUtf16 +{ + static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); + static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); +}; + +struct QUtf32 +{ + static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); + static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); +}; + +/* + Converts from different utf encodings looking at a possible byte order mark at the + beginning of the string. If no BOM exists, utf-8 is assumed. 
+ */ +Q_CORE_EXPORT QString qFromUtfEncoded(const QByteArray &ba); + +QT_END_NAMESPACE + +#endif // QSTRINGCONVERTER_P_H diff --git a/src/corelib/text/text.pri b/src/corelib/text/text.pri index 4c584cf958..1275c014a8 100644 --- a/src/corelib/text/text.pri +++ b/src/corelib/text/text.pri @@ -20,6 +20,7 @@ HEADERS += \ text/qstringalgorithms_p.h \ text/qstringbuilder.h \ text/qstringconverter.h \ + text/qstringconverter_p.h \ text/qstringiterator_p.h \ text/qstringlist.h \ text/qstringliteral.h \ diff --git a/src/gui/kernel/qclipboard.cpp b/src/gui/kernel/qclipboard.cpp index 72f27d3e49..3b42e78624 100644 --- a/src/gui/kernel/qclipboard.cpp +++ b/src/gui/kernel/qclipboard.cpp @@ -46,9 +46,7 @@ #include "qvariant.h" #include "qbuffer.h" #include "qimage.h" -#if QT_CONFIG(textcodec) -#include "private/qutfcodec_p.h" -#endif +#include "private/qstringconverter_p.h" #include "private/qguiapplication_p.h" #include <qpa/qplatformintegration.h> diff --git a/src/tools/bootstrap/.prev_CMakeLists.txt b/src/tools/bootstrap/.prev_CMakeLists.txt index 8f430c494e..f81e03adff 100644 --- a/src/tools/bootstrap/.prev_CMakeLists.txt +++ b/src/tools/bootstrap/.prev_CMakeLists.txt @@ -41,9 +41,6 @@ qt_add_module(Bootstrap ../../3rdparty/pcre2/src/pcre2_ucp.h ../../3rdparty/pcre2/src/pcre2_valid_utf.c ../../3rdparty/pcre2/src/pcre2_xclass.c - ../../corelib/codecs/qlatincodec.cpp - ../../corelib/codecs/qtextcodec.cpp - ../../corelib/codecs/qutfcodec.cpp ../../corelib/global/qendian.cpp ../../corelib/global/qglobal.cpp ../../corelib/global/qlogging.cpp @@ -109,7 +106,6 @@ qt_add_module(Bootstrap ../../corelib/text/qstringbuilder.cpp ../../corelib/text/qstringconverter.cpp ../../corelib/text/qstringlist.cpp - ../../corelib/text/qstringview.cpp ../../corelib/text/qvsnprintf.cpp ../../corelib/time/qcalendar.cpp ../../corelib/time/qdatetime.cpp diff --git a/src/tools/bootstrap/CMakeLists.txt b/src/tools/bootstrap/CMakeLists.txt index 5a17888003..a5184fbb80 100644 --- 
a/src/tools/bootstrap/CMakeLists.txt +++ b/src/tools/bootstrap/CMakeLists.txt @@ -42,9 +42,6 @@ qt_extend_target(Bootstrap ../../3rdparty/pcre2/src/pcre2_ucp.h ../../3rdparty/pcre2/src/pcre2_valid_utf.c ../../3rdparty/pcre2/src/pcre2_xclass.c - ../../corelib/codecs/qlatincodec.cpp - ../../corelib/codecs/qtextcodec.cpp - ../../corelib/codecs/qutfcodec.cpp ../../corelib/global/qendian.cpp ../../corelib/global/qglobal.cpp ../../corelib/global/qlogging.cpp @@ -110,7 +107,6 @@ qt_extend_target(Bootstrap ../../corelib/text/qstringbuilder.cpp ../../corelib/text/qstringconverter.cpp ../../corelib/text/qstringlist.cpp - ../../corelib/text/qstringview.cpp ../../corelib/text/qvsnprintf.cpp ../../corelib/time/qcalendar.cpp ../../corelib/time/qdatetime.cpp diff --git a/src/tools/bootstrap/bootstrap.pro b/src/tools/bootstrap/bootstrap.pro index 169c5fe1c2..5b7da8e687 100644 --- a/src/tools/bootstrap/bootstrap.pro +++ b/src/tools/bootstrap/bootstrap.pro @@ -28,9 +28,6 @@ INCLUDEPATH += \ $$PWD/../../3rdparty/pcre2/src SOURCES += \ - ../../corelib/codecs/qlatincodec.cpp \ - ../../corelib/codecs/qtextcodec.cpp \ - ../../corelib/codecs/qutfcodec.cpp \ ../../corelib/global/qendian.cpp \ ../../corelib/global/qglobal.cpp \ ../../corelib/global/qlogging.cpp \ @@ -96,7 +93,6 @@ SOURCES += \ ../../corelib/text/qstringconverter.cpp \ ../../corelib/text/qstring_compat.cpp \ ../../corelib/text/qstringlist.cpp \ - ../../corelib/text/qstringview.cpp \ ../../corelib/text/qvsnprintf.cpp \ ../../corelib/time/qcalendar.cpp \ ../../corelib/time/qdatetime.cpp \ |