summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--qmake/CMakeLists.txt2
-rw-r--r--qmake/Makefile.unix11
-rw-r--r--qmake/Makefile.win322
-rw-r--r--qmake/qmake.pro5
-rw-r--r--src/corelib/CMakeLists.txt1
-rw-r--r--src/corelib/codecs/qutfcodec.cpp940
-rw-r--r--src/corelib/codecs/qutfcodec_p.h262
-rw-r--r--src/corelib/global/qconfig-bootstrapped.h5
-rw-r--r--src/corelib/io/qfilesystemiterator_unix.cpp2
-rw-r--r--src/corelib/io/qurlrecode.cpp2
-rw-r--r--src/corelib/serialization/qcborstreamreader.cpp2
-rw-r--r--src/corelib/serialization/qcborvalue_p.h2
-rw-r--r--src/corelib/serialization/qjsonparser.cpp2
-rw-r--r--src/corelib/serialization/qjsonwriter.cpp2
-rw-r--r--src/corelib/text/qstring.cpp2
-rw-r--r--src/corelib/text/qstringbuilder.cpp2
-rw-r--r--src/corelib/text/qstringconverter.cpp949
-rw-r--r--src/corelib/text/qstringconverter_p.h323
-rw-r--r--src/corelib/text/text.pri1
-rw-r--r--src/gui/kernel/qclipboard.cpp4
-rw-r--r--src/tools/bootstrap/.prev_CMakeLists.txt4
-rw-r--r--src/tools/bootstrap/CMakeLists.txt4
-rw-r--r--src/tools/bootstrap/bootstrap.pro4
23 files changed, 1294 insertions, 1239 deletions
diff --git a/qmake/CMakeLists.txt b/qmake/CMakeLists.txt
index d724b44d59..fa7d50f234 100644
--- a/qmake/CMakeLists.txt
+++ b/qmake/CMakeLists.txt
@@ -40,7 +40,6 @@ qt_add_tool(qmake # special case
../src/3rdparty/pcre2/src/pcre2_ucp.h
../src/3rdparty/pcre2/src/pcre2_valid_utf.c
../src/3rdparty/pcre2/src/pcre2_xclass.c
- ../src/corelib/codecs/qutfcodec.cpp ../src/corelib/codecs/qutfcodec_p.h
../src/corelib/global/qendian.cpp # special case
../src/corelib/global/qglobal.cpp ../src/corelib/global/qglobal.h
../src/corelib/global/qlibraryinfo.cpp
@@ -105,6 +104,7 @@ qt_add_tool(qmake # special case
../src/corelib/tools/qringbuffer.cpp # special case
../src/corelib/text/qstring.cpp ../src/corelib/text/qstring.h
../src/corelib/text/qstringbuilder.cpp ../src/corelib/text/qstringbuilder.h
+ ../src/corelib/text/qstringconverter.cpp ../src/corelib/text/qstringconverter.h ../src/corelib/text/qstringconverter_p.h
../src/corelib/text/qstringlist.cpp ../src/corelib/text/qstringlist.h
../src/corelib/text/qstringmatcher.h
../src/corelib/tools/qvector.h
diff --git a/qmake/Makefile.unix b/qmake/Makefile.unix
index 98d255f2d5..c0b6704351 100644
--- a/qmake/Makefile.unix
+++ b/qmake/Makefile.unix
@@ -17,7 +17,6 @@ OBJS = \
#qt code (please keep in order matching DEPEND_SRC)
QOBJS = \
- qutfcodec.o \
qendian.o qglobal.o qlogging.o qmalloc.o qnumeric.o qoperatingsystemversion.o qrandom.o \
qabstractfileengine.o qbuffer.o qdatastream.o qdebug.o \
qdir.o qdiriterator.o \
@@ -32,7 +31,7 @@ QOBJS = \
qcalendar.o qgregoriancalendar.o qromancalendar.o \
qcryptographichash.o qdatetime.o qhash.o \
qlocale.o qlocale_tools.o qmap.o qregularexpression.o qregexp.o qringbuffer.o \
- qstringbuilder.o qstring.o qstringlist.o qversionnumber.o \
+ qstringbuilder.o qstring.o qstringconverter.o qstringlist.o qversionnumber.o \
qvsnprintf.o qxmlstream.o qxmlutils.o \
pcre2_auto_possess.o pcre2_chartables.o pcre2_compile.o pcre2_config.o \
pcre2_context.o pcre2_dfa_match.o pcre2_error.o pcre2_extuni.o \
@@ -74,7 +73,6 @@ DEPEND_SRC = \
$(QMKGENSRC)/win32/msvc_vcxproj.cpp \
$(QMKGENSRC)/win32/winmakefile.cpp \
$(QMKGENSRC)/xmloutput.cpp \
- $(SOURCE_PATH)/src/corelib/codecs/qutfcodec.cpp \
$(SOURCE_PATH)/src/corelib/global/qendian.cpp \
$(SOURCE_PATH)/src/corelib/global/qglobal.cpp \
$(SOURCE_PATH)/src/corelib/global/qlibraryinfo.cpp \
@@ -122,6 +120,7 @@ DEPEND_SRC = \
$(SOURCE_PATH)/src/corelib/text/qregularexpression.cpp \
$(SOURCE_PATH)/src/corelib/text/qregexp.cpp \
$(SOURCE_PATH)/src/corelib/text/qstringbuilder.cpp \
+ $(SOURCE_PATH)/src/corelib/text/qstringconverter.cpp \
$(SOURCE_PATH)/src/corelib/text/qstring.cpp \
$(SOURCE_PATH)/src/corelib/text/qstringlist.cpp \
$(SOURCE_PATH)/src/corelib/text/qvsnprintf.cpp \
@@ -380,15 +379,15 @@ qoperatingsystemversion_darwin.o: $(SOURCE_PATH)/src/corelib/global/qoperatingsy
qcore_foundation.o: $(SOURCE_PATH)/src/corelib/kernel/qcore_foundation.mm
$(CXX) -c -o $@ $(CXXFLAGS) $<
-qutfcodec.o: $(SOURCE_PATH)/src/corelib/codecs/qutfcodec.cpp
- $(CXX) -c -o $@ $(CXXFLAGS) $<
-
qstring.o: $(SOURCE_PATH)/src/corelib/text/qstring.cpp
$(CXX) -c -o $@ $(CXXFLAGS) $<
qstringbuilder.o: $(SOURCE_PATH)/src/corelib/text/qstringbuilder.cpp
$(CXX) -c -o $@ $(CXXFLAGS) $<
+qstringconverter.o: $(SOURCE_PATH)/src/corelib/text/qstringconverter.cpp
+ $(CXX) -c -o $@ $(CXXFLAGS) $<
+
qlocale.o: $(SOURCE_PATH)/src/corelib/text/qlocale.cpp
$(CXX) -c -o $@ $(CXXFLAGS) $<
diff --git a/qmake/Makefile.win32 b/qmake/Makefile.win32
index d3a85c17b2..df47dacd15 100644
--- a/qmake/Makefile.win32
+++ b/qmake/Makefile.win32
@@ -104,8 +104,8 @@ QTOBJS= \
qoperatingsystemversion_win.obj \
qregexp.obj \
qromancalendar.obj \
- qutfcodec.obj \
qstring.obj \
+ qstringconverter.obj \
qstringlist.obj \
qstringbuilder.obj \
qsystemerror.obj \
diff --git a/qmake/qmake.pro b/qmake/qmake.pro
index 243f07ac2c..fcd1c17dcf 100644
--- a/qmake/qmake.pro
+++ b/qmake/qmake.pro
@@ -159,11 +159,11 @@ SOURCES += \
qsettings.cpp \
qstring.cpp \
qstringbuilder.cpp \
+ qstringconverter.cpp \
qstringlist.cpp \
qsystemerror.cpp \
qtemporaryfile.cpp \
qtextstream.cpp \
- qutfcodec.cpp \
quuid.cpp \
qvariant.cpp \
qversionnumber.cpp \
@@ -217,12 +217,13 @@ HEADERS += \
qromancalendar_p.h \
qstring.h \
qstringbuilder.h \
+ qstringconverter_p.h \
+ qstringconverter.h \
qstringlist.h \
qstringmatcher.h \
qsystemerror_p.h \
qtemporaryfile.h \
qtextstream.h \
- qutfcodec_p.h \
quuid.h \
qvector.h \
qversionnumber.h \
diff --git a/src/corelib/CMakeLists.txt b/src/corelib/CMakeLists.txt
index ff28b2d20c..710d025caf 100644
--- a/src/corelib/CMakeLists.txt
+++ b/src/corelib/CMakeLists.txt
@@ -169,6 +169,7 @@ qt_add_module(Core
text/qstring.cpp text/qstring.h
text/qstring_compat.cpp
text/qstringalgorithms.h text/qstringalgorithms_p.h
+ text/qstringconverter.cpp text/qstringconverter.h text/qstringconverter_p.h
text/qstringbuilder.cpp text/qstringbuilder.h
text/qstringiterator_p.h
text/qstringlist.cpp text/qstringlist.h
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index a31bfbd218..c518ab1d9c 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -48,946 +48,6 @@
QT_BEGIN_NAMESPACE
-enum { Endian = 0, Data = 1 };
-
-static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
-
-#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
- || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
-static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
-{
- uint result = qCountLeadingZeroBits(v);
- // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
- // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
- // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
- result ^= sizeof(unsigned) * 8 - 1;
- return result;
-}
-#endif
-
-#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
-static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
-{
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16, dst += 16) {
-# ifdef __AVX2__
- __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
- __m128i data1 = _mm256_castsi256_si128(data);
- __m128i data2 = _mm256_extracti128_si256(data, 1);
-# else
- __m128i data1 = _mm_loadu_si128((const __m128i*)src);
- __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
-# endif
-
- // check if everything is ASCII
- // the highest ASCII value is U+007F
- // Do the packing directly:
- // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
- // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
- // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
- // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
- // "non-ASCII", but it's an acceptable compromise.
- __m128i packed = _mm_packus_epi16(data1, data2);
- __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
-
- // store, even if there are non-ASCII characters here
- _mm_storeu_si128((__m128i*)dst, packed);
-
- // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
- ushort n = ~_mm_movemask_epi8(nonAscii);
- if (n) {
- // find the next probable ASCII character
- // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- n = qCountTrailingZeroBits(n);
- dst += n;
- src += n;
- return false;
- }
- }
-
- if (end - src >= 8) {
- // do eight characters at a time
- __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
- __m128i packed = _mm_packus_epi16(data, data);
- __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
-
- // store even non-ASCII
- _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
-
- uchar n = ~_mm_movemask_epi8(nonAscii);
- if (n) {
- nextAscii = src + qBitScanReverse(n) + 1;
- n = qCountTrailingZeroBits(n);
- dst += n;
- src += n;
- return false;
- }
- }
-
- return src == end;
-}
-
-static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
-{
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16, dst += 16) {
- __m128i data = _mm_loadu_si128((const __m128i*)src);
-
-#ifdef __AVX2__
- const int BitSpacing = 2;
- // load and zero extend to an YMM register
- const __m256i extended = _mm256_cvtepu8_epi16(data);
-
- uint n = _mm256_movemask_epi8(extended);
- if (!n) {
- // store
- _mm256_storeu_si256((__m256i*)dst, extended);
- continue;
- }
-#else
- const int BitSpacing = 1;
-
- // check if everything is ASCII
- // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
- uint n = _mm_movemask_epi8(data);
- if (!n) {
- // unpack
- _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
- _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
- continue;
- }
-#endif
-
- // copy the front part that is still ASCII
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= BitSpacing;
- }
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- n = qBitScanReverse(n);
- nextAscii = src + (n / BitSpacing) + 1;
- return false;
-
- }
-
- if (end - src >= 8) {
- __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
- uint n = _mm_movemask_epi8(data) & 0xff;
- if (!n) {
- // unpack and store
- _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
- } else {
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= 1;
- }
-
- n = qBitScanReverse(n);
- nextAscii = src + n + 1;
- return false;
- }
- }
-
- return src == end;
-}
-
-static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
-{
-#ifdef __AVX2__
- // do 32 characters at a time
- // (this is similar to simdTestMask in qstring.cpp)
- const __m256i mask = _mm256_set1_epi8(0x80);
- for ( ; end - src >= 32; src += 32) {
- __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
- if (_mm256_testz_si256(mask, data))
- continue;
-
- uint n = _mm256_movemask_epi8(data);
- Q_ASSUME(n);
-
- // find the next probable ASCII character
- // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- // return the non-ASCII character
- return src + qCountTrailingZeroBits(n);
- }
-#endif
-
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16) {
- __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
-
- // check if everything is ASCII
- // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
- uint n = _mm_movemask_epi8(data);
- if (!n)
- continue;
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- // return the non-ASCII character
- return src + qCountTrailingZeroBits(n);
- }
-
- // do four characters at a time
- for ( ; end - src >= 4; src += 4) {
- quint32 data = qFromUnaligned<quint32>(src);
- data &= 0x80808080U;
- if (!data)
- continue;
-
- // We don't try to guess which of the three bytes is ASCII and which
- // one isn't. The chance that at least two of them are non-ASCII is
- // better than 75%.
- nextAscii = src;
- return src;
- }
- nextAscii = end;
- return src;
-}
-#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
-static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
-{
- uint16x8_t maxAscii = vdupq_n_u16(0x7f);
- uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
- uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
-
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16, dst += 16) {
- // load 2 lanes (or: "load interleaved")
- uint16x8x2_t in = vld2q_u16(src);
-
- // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
- // add those together into a scalar, and merge the scalars.
- uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
- | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
-
- // merge the two lanes by shifting the values of the second by 8 and inserting them
- uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
-
- // store, even if there are non-ASCII characters here
- vst1q_u8(dst, vreinterpretq_u8_u16(out));
-
- if (nonAscii) {
- // find the next probable ASCII character
- // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(nonAscii) + 1;
-
- nonAscii = qCountTrailingZeroBits(nonAscii);
- dst += nonAscii;
- src += nonAscii;
- return false;
- }
- }
- return src == end;
-}
-
-static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
-{
- // do eight characters at a time
- uint8x8_t msb_mask = vdup_n_u8(0x80);
- uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
- for ( ; end - src >= 8; src += 8, dst += 8) {
- uint8x8_t c = vld1_u8(src);
- uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
- if (!n) {
- // store
- vst1q_u16(dst, vmovl_u8(c));
- continue;
- }
-
- // copy the front part that is still ASCII
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= 1;
- }
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- n = qBitScanReverse(n);
- nextAscii = src + n + 1;
- return false;
-
- }
- return src == end;
-}
-
-static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
-{
- // The SIMD code below is untested, so just force an early return until
- // we've had the time to verify it works.
- nextAscii = end;
- return src;
-
- // do eight characters at a time
- uint8x8_t msb_mask = vdup_n_u8(0x80);
- uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
- for ( ; end - src >= 8; src += 8) {
- uint8x8_t c = vld1_u8(src);
- uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
- if (!n)
- continue;
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- // return the non-ASCII character
- return src + qCountTrailingZeroBits(n);
- }
- nextAscii = end;
- return src;
-}
-#else
-static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
-{
- return false;
-}
-
-static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
-{
- return false;
-}
-
-static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
-{
- nextAscii = end;
- return src;
-}
-#endif
-
-QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
-{
- // create a QByteArray with the worst case scenario size
- QByteArray result(len * 3, Qt::Uninitialized);
- uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
- const ushort *src = reinterpret_cast<const ushort *>(uc);
- const ushort *const end = src + len;
-
- while (src != end) {
- const ushort *nextAscii = end;
- if (simdEncodeAscii(dst, nextAscii, src, end))
- break;
-
- do {
- ushort uc = *src++;
- int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
- if (res < 0) {
- // encoding error - append '?'
- *dst++ = '?';
- }
- } while (src < nextAscii);
- }
-
- result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
- return result;
-}
-
-QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
-{
- uchar replacement = '?';
- int rlen = 3*len;
- int surrogate_high = -1;
- if (state) {
- if (state->flags & QTextCodec::ConvertInvalidToNull)
- replacement = 0;
- if (!(state->flags & QTextCodec::IgnoreHeader))
- rlen += 3;
- if (state->remainingChars)
- surrogate_high = state->state_data[0];
- }
-
-
- QByteArray rstr(rlen, Qt::Uninitialized);
- uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
- const ushort *src = reinterpret_cast<const ushort *>(uc);
- const ushort *const end = src + len;
-
- int invalid = 0;
- if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
- // append UTF-8 BOM
- *cursor++ = utf8bom[0];
- *cursor++ = utf8bom[1];
- *cursor++ = utf8bom[2];
- }
-
- const ushort *nextAscii = src;
- while (src != end) {
- int res;
- ushort uc;
- if (surrogate_high != -1) {
- uc = surrogate_high;
- surrogate_high = -1;
- res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
- } else {
- if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
- break;
-
- uc = *src++;
- res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
- }
- if (Q_LIKELY(res >= 0))
- continue;
-
- if (res == QUtf8BaseTraits::Error) {
- // encoding error
- ++invalid;
- *cursor++ = replacement;
- } else if (res == QUtf8BaseTraits::EndOfString) {
- surrogate_high = uc;
- break;
- }
- }
-
- rstr.resize(cursor - (const uchar*)rstr.constData());
- if (state) {
- state->invalidChars += invalid;
- state->flags |= QTextCodec::IgnoreHeader;
- state->remainingChars = 0;
- if (surrogate_high >= 0) {
- state->remainingChars = 1;
- state->state_data[0] = surrogate_high;
- }
- }
- return rstr;
-}
-
-QString QUtf8::convertToUnicode(const char *chars, int len)
-{
- // UTF-8 to UTF-16 always needs the exact same number of words or less:
- // UTF-8 UTF-16
- // 1 byte 1 word
- // 2 bytes 1 word
- // 3 bytes 1 word
- // 4 bytes 2 words (one surrogate pair)
- // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
- // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
- // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
- //
- // The table holds for invalid sequences too: we'll insert one replacement char
- // per invalid byte.
- QString result(len, Qt::Uninitialized);
- QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
- const QChar *end = convertToUnicode(data, chars, len);
- result.truncate(end - data);
- return result;
-}
-
-/*!
- \since 5.7
- \overload
-
- Converts the UTF-8 sequence of \a len octets beginning at \a chars to
- a sequence of QChar starting at \a buffer. The buffer is expected to be
- large enough to hold the result. An upper bound for the size of the
- buffer is \a len QChars.
-
- If, during decoding, an error occurs, a QChar::ReplacementCharacter is
- written.
-
- Returns a pointer to one past the last QChar written.
-
- This function never throws.
-*/
-
-QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, int len) noexcept
-{
- ushort *dst = reinterpret_cast<ushort *>(buffer);
- const uchar *src = reinterpret_cast<const uchar *>(chars);
- const uchar *end = src + len;
-
- // attempt to do a full decoding in SIMD
- const uchar *nextAscii = end;
- if (!simdDecodeAscii(dst, nextAscii, src, end)) {
- // at least one non-ASCII entry
- // check if we failed to decode the UTF-8 BOM; if so, skip it
- if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
- && end - src >= 3
- && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
- src += 3;
- }
-
- while (src < end) {
- nextAscii = end;
- if (simdDecodeAscii(dst, nextAscii, src, end))
- break;
-
- do {
- uchar b = *src++;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
- if (res < 0) {
- // decoding error
- *dst++ = QChar::ReplacementCharacter;
- }
- } while (src < nextAscii);
- }
- }
-
- return reinterpret_cast<QChar *>(dst);
-}
-
-QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
-{
- bool headerdone = false;
- ushort replacement = QChar::ReplacementCharacter;
- int invalid = 0;
- int res;
- uchar ch = 0;
-
- // See above for buffer requirements for stateless decoding. However, that
- // fails if the state is not empty. The following situations can add to the
- // requirements:
- // state contains chars starts with requirement
- // 1 of 2 bytes valid continuation 0
- // 2 of 3 bytes same 0
- // 3 bytes of 4 same +1 (need to insert surrogate pair)
- // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
- // 2 of 3 bytes same +1 (same)
- // 3 of 4 bytes same +1 (same)
- QString result(len + 1, Qt::Uninitialized);
-
- ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
- const uchar *src = reinterpret_cast<const uchar *>(chars);
- const uchar *end = src + len;
-
- if (state) {
- if (state->flags & QTextCodec::IgnoreHeader)
- headerdone = true;
- if (state->flags & QTextCodec::ConvertInvalidToNull)
- replacement = QChar::Null;
- if (state->remainingChars) {
- // handle incoming state first
- uchar remainingCharsData[4]; // longest UTF-8 sequence possible
- int remainingCharsCount = state->remainingChars;
- int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
-
- memset(remainingCharsData, 0, sizeof(remainingCharsData));
- memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
- memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
-
- const uchar *begin = &remainingCharsData[1];
- res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
- static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
- if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
- // special case for len == 0:
- // if we were supplied an empty string, terminate the previous, unfinished sequence with error
- ++invalid;
- *dst++ = replacement;
- } else if (res == QUtf8BaseTraits::EndOfString) {
- // if we got EndOfString again, then there were too few bytes in src;
- // copy to our state and return
- state->remainingChars = remainingCharsCount + newCharsToCopy;
- memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
- return QString();
- } else if (!headerdone && res >= 0) {
- // eat the UTF-8 BOM
- headerdone = true;
- if (dst[-1] == 0xfeff)
- --dst;
- }
-
- // adjust src now that we have maybe consumed a few chars
- if (res >= 0) {
- Q_ASSERT(res > remainingCharsCount);
- src += res - remainingCharsCount;
- }
- }
- }
-
- // main body, stateless decoding
- res = 0;
- const uchar *nextAscii = src;
- const uchar *start = src;
- while (res >= 0 && src < end) {
- if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
- break;
-
- ch = *src++;
- res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
- if (!headerdone && res >= 0) {
- headerdone = true;
- if (src == start + 3) { // 3 == sizeof(utf8-bom)
- // eat the UTF-8 BOM (it can only appear at the beginning of the string).
- if (dst[-1] == 0xfeff)
- --dst;
- }
- }
- if (res == QUtf8BaseTraits::Error) {
- res = 0;
- ++invalid;
- *dst++ = replacement;
- }
- }
-
- if (!state && res == QUtf8BaseTraits::EndOfString) {
- // unterminated UTF sequence
- *dst++ = QChar::ReplacementCharacter;
- while (src++ < end)
- *dst++ = QChar::ReplacementCharacter;
- }
-
- result.truncate(dst - (const ushort *)result.unicode());
- if (state) {
- state->invalidChars += invalid;
- if (headerdone)
- state->flags |= QTextCodec::IgnoreHeader;
- if (res == QUtf8BaseTraits::EndOfString) {
- --src; // unread the byte in ch
- state->remainingChars = end - src;
- memcpy(&state->state_data[0], src, end - src);
- } else {
- state->remainingChars = 0;
- }
- }
- return result;
-}
-
-struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
-{
- struct NoOutput {};
- static void appendUtf16(const NoOutput &, ushort) {}
- static void appendUcs4(const NoOutput &, uint) {}
-};
-
-QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
-{
- const uchar *src = reinterpret_cast<const uchar *>(chars);
- const uchar *end = src + len;
- const uchar *nextAscii = src;
- bool isValidAscii = true;
-
- while (src < end) {
- if (src >= nextAscii)
- src = simdFindNonAscii(src, end, nextAscii);
- if (src == end)
- break;
-
- do {
- uchar b = *src++;
- if ((b & 0x80) == 0)
- continue;
-
- isValidAscii = false;
- QUtf8NoOutputTraits::NoOutput output;
- int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
- if (res < 0) {
- // decoding error
- return { false, false };
- }
- } while (src < nextAscii);
- }
-
- return { true, isValidAscii };
-}
-
-int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len)
-{
- uint uc1, uc2;
- auto src1 = reinterpret_cast<const uchar *>(utf8);
- auto end1 = src1 + u8len;
- QStringIterator src2(utf16, utf16 + u16len);
-
- while (src1 < end1 && src2.hasNext()) {
- uchar b = *src1++;
- uint *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
- if (res < 0) {
- // decoding error
- uc1 = QChar::ReplacementCharacter;
- }
-
- uc2 = src2.next();
- if (uc1 != uc2)
- return int(uc1) - int(uc2);
- }
-
- // the shorter string sorts first
- return (end1 > src1) - int(src2.hasNext());
-}
-
-int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
-{
- uint uc1;
- auto src1 = reinterpret_cast<const uchar *>(utf8);
- auto end1 = src1 + u8len;
- auto src2 = reinterpret_cast<const uchar *>(s.latin1());
- auto end2 = src2 + s.size();
-
- while (src1 < end1 && src2 < end2) {
- uchar b = *src1++;
- uint *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
- if (res < 0) {
- // decoding error
- uc1 = QChar::ReplacementCharacter;
- }
-
- uint uc2 = *src2++;
- if (uc1 != uc2)
- return int(uc1) - int(uc2);
- }
-
- // the shorter string sorts first
- return (end1 > src1) - (end2 > src2);
-}
-
-QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- int length = 2*len;
- if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
- length += 2;
- }
- if (e == DetectEndianness) {
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
- }
-
- QByteArray d;
- d.resize(length);
- char *data = d.data();
- if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
- QChar bom(QChar::ByteOrderMark);
- if (endian == BigEndianness)
- qToBigEndian(bom.unicode(), data);
- else
- qToLittleEndian(bom.unicode(), data);
- data += 2;
- }
- if (endian == BigEndianness)
- qToBigEndian<ushort>(uc, len, data);
- else
- qToLittleEndian<ushort>(uc, len, data);
-
- if (state) {
- state->remainingChars = 0;
- state->flags |= QTextCodec::IgnoreHeader;
- }
- return d;
-}
-
-QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- bool half = false;
- uchar buf = 0;
- bool headerdone = false;
- if (state) {
- headerdone = state->flags & QTextCodec::IgnoreHeader;
- if (endian == DetectEndianness)
- endian = (DataEndianness)state->state_data[Endian];
- if (state->remainingChars) {
- half = true;
- buf = state->state_data[Data];
- }
- }
- if (headerdone && endian == DetectEndianness)
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
-
- QString result(len, Qt::Uninitialized); // worst case
- QChar *qch = (QChar *)result.data();
- while (len--) {
- if (half) {
- QChar ch;
- if (endian == LittleEndianness) {
- ch.setRow(*chars++);
- ch.setCell(buf);
- } else {
- ch.setRow(buf);
- ch.setCell(*chars++);
- }
- if (!headerdone) {
- headerdone = true;
- if (endian == DetectEndianness) {
- if (ch == QChar::ByteOrderSwapped) {
- endian = LittleEndianness;
- } else if (ch == QChar::ByteOrderMark) {
- endian = BigEndianness;
- } else {
- if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
- endian = BigEndianness;
- } else {
- endian = LittleEndianness;
- ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
- }
- *qch++ = ch;
- }
- } else if (ch != QChar::ByteOrderMark) {
- *qch++ = ch;
- }
- } else {
- *qch++ = ch;
- }
- half = false;
- } else {
- buf = *chars++;
- half = true;
- }
- }
- result.truncate(qch - result.unicode());
-
- if (state) {
- if (headerdone)
- state->flags |= QTextCodec::IgnoreHeader;
- state->state_data[Endian] = endian;
- if (half) {
- state->remainingChars = 1;
- state->state_data[Data] = buf;
- } else {
- state->remainingChars = 0;
- state->state_data[Data] = 0;
- }
- }
- return result;
-}
-
-QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- int length = 4*len;
- if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
- length += 4;
- }
- if (e == DetectEndianness) {
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
- }
-
- QByteArray d(length, Qt::Uninitialized);
- char *data = d.data();
- if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
- if (endian == BigEndianness) {
- data[0] = 0;
- data[1] = 0;
- data[2] = (char)0xfe;
- data[3] = (char)0xff;
- } else {
- data[0] = (char)0xff;
- data[1] = (char)0xfe;
- data[2] = 0;
- data[3] = 0;
- }
- data += 4;
- }
-
- QStringIterator i(uc, uc + len);
- if (endian == BigEndianness) {
- while (i.hasNext()) {
- uint cp = i.next();
- qToBigEndian(cp, data);
- data += 4;
- }
- } else {
- while (i.hasNext()) {
- uint cp = i.next();
- qToLittleEndian(cp, data);
- data += 4;
- }
- }
-
- if (state) {
- state->remainingChars = 0;
- state->flags |= QTextCodec::IgnoreHeader;
- }
- return d;
-}
-
-QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- uchar tuple[4];
- int num = 0;
- bool headerdone = false;
- if (state) {
- headerdone = state->flags & QTextCodec::IgnoreHeader;
- if (endian == DetectEndianness) {
- endian = (DataEndianness)state->state_data[Endian];
- }
- num = state->remainingChars;
- memcpy(tuple, &state->state_data[Data], 4);
- }
- if (headerdone && endian == DetectEndianness)
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
-
- QString result;
- result.resize((num + len) >> 2 << 1); // worst case
- QChar *qch = (QChar *)result.data();
-
- const char *end = chars + len;
- while (chars < end) {
- tuple[num++] = *chars++;
- if (num == 4) {
- if (!headerdone) {
- headerdone = true;
- if (endian == DetectEndianness) {
- if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
- endian = LittleEndianness;
- num = 0;
- continue;
- } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
- endian = BigEndianness;
- num = 0;
- continue;
- } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
- endian = BigEndianness;
- } else {
- endian = LittleEndianness;
- }
- } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
- num = 0;
- continue;
- }
- }
- uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
- for (char16_t c : QChar::fromUcs4(code))
- *qch++ = c;
- num = 0;
- }
- }
- result.truncate(qch - result.unicode());
-
- if (state) {
- if (headerdone)
- state->flags |= QTextCodec::IgnoreHeader;
- state->state_data[Endian] = endian;
- state->remainingChars = num;
- memcpy(&state->state_data[Data], tuple, 4);
- }
- return result;
-}
-
-QString qFromUtfEncoded(const QByteArray &ba)
-{
- const int arraySize = ba.size();
- const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
- const uint bom = 0xfeff;
-
- if (arraySize > 3) {
- uint uc = qFromUnaligned<uint>(buf);
- if (uc == qToBigEndian(bom) || uc == qToLittleEndian(bom))
- return QUtf32::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-32
- }
-
- if (arraySize > 1) {
- ushort uc = qFromUnaligned<ushort>(buf);
- if (uc == qToBigEndian(ushort(bom)) || qToLittleEndian(ushort(bom)))
- return QUtf16::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-16
- }
- return QUtf8::convertToUnicode(ba.constData(), ba.length());
-}
-
#if QT_CONFIG(textcodec)
QUtf8Codec::~QUtf8Codec()
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index b1c7a23d4f..893a6db8e1 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -60,271 +60,11 @@
#include "QtCore/qtextcodec.h"
#endif
+#include "private/qstringconverter_p.h"
#include "private/qtextcodec_p.h"
QT_BEGIN_NAMESPACE
-struct QUtf8BaseTraits
-{
- static const bool isTrusted = false;
- static const bool allowNonCharacters = true;
- static const bool skipAsciiHandling = false;
- static const int Error = -1;
- static const int EndOfString = -2;
-
- static bool isValidCharacter(uint u)
- { return int(u) >= 0; }
-
- static void appendByte(uchar *&ptr, uchar b)
- { *ptr++ = b; }
-
- static uchar peekByte(const uchar *ptr, int n = 0)
- { return ptr[n]; }
-
- static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
- { return end - ptr; }
-
- static void advanceByte(const uchar *&ptr, int n = 1)
- { ptr += n; }
-
- static void appendUtf16(ushort *&ptr, ushort uc)
- { *ptr++ = uc; }
-
- static void appendUcs4(ushort *&ptr, uint uc)
- {
- appendUtf16(ptr, QChar::highSurrogate(uc));
- appendUtf16(ptr, QChar::lowSurrogate(uc));
- }
-
- static ushort peekUtf16(const ushort *ptr, int n = 0)
- { return ptr[n]; }
-
- static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
- { return end - ptr; }
-
- static void advanceUtf16(const ushort *&ptr, int n = 1)
- { ptr += n; }
-
- // it's possible to output to UCS-4 too
- static void appendUtf16(uint *&ptr, ushort uc)
- { *ptr++ = uc; }
-
- static void appendUcs4(uint *&ptr, uint uc)
- { *ptr++ = uc; }
-};
-
-struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
-{
- static const bool skipAsciiHandling = true;
-};
-
-namespace QUtf8Functions
-{
- /// returns 0 on success; errors can only happen if \a u is a surrogate:
- /// Error if \a u is a low surrogate;
- /// if \a u is a high surrogate, Error if the next isn't a low one,
- /// EndOfString if we run into the end of the string.
- template <typename Traits, typename OutputPtr, typename InputPtr> inline
- int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
- {
- if (!Traits::skipAsciiHandling && u < 0x80) {
- // U+0000 to U+007F (US-ASCII) - one byte
- Traits::appendByte(dst, uchar(u));
- return 0;
- } else if (u < 0x0800) {
- // U+0080 to U+07FF - two bytes
- // first of two bytes
- Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
- } else {
- if (!QChar::isSurrogate(u)) {
- // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
- if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
- return Traits::Error;
-
- // first of three bytes
- Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
- } else {
- // U+10000 to U+10FFFF - four bytes
- // need to get one extra codepoint
- if (Traits::availableUtf16(src, end) == 0)
- return Traits::EndOfString;
-
- ushort low = Traits::peekUtf16(src);
- if (!QChar::isHighSurrogate(u))
- return Traits::Error;
- if (!QChar::isLowSurrogate(low))
- return Traits::Error;
-
- Traits::advanceUtf16(src);
- uint ucs4 = QChar::surrogateToUcs4(u, low);
-
- if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
- return Traits::Error;
-
- // first byte
- Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
-
- // second of four bytes
- Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
-
- // for the rest of the bytes
- u = ushort(ucs4);
- }
-
- // second to last byte
- Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
- }
-
- // last byte
- Traits::appendByte(dst, 0x80 | (u & 0x3f));
- return 0;
- }
-
- inline bool isContinuationByte(uchar b)
- {
- return (b & 0xc0) == 0x80;
- }
-
- /// returns the number of characters consumed (including \a b) in case of success;
- /// returns negative in case of error: Traits::Error or Traits::EndOfString
- template <typename Traits, typename OutputPtr, typename InputPtr> inline
- int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
- {
- int charsNeeded;
- uint min_uc;
- uint uc;
-
- if (!Traits::skipAsciiHandling && b < 0x80) {
- // US-ASCII
- Traits::appendUtf16(dst, b);
- return 1;
- }
-
- if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
- // an UTF-8 first character must be at least 0xC0
- // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
- return Traits::Error;
- } else if (b < 0xe0) {
- charsNeeded = 2;
- min_uc = 0x80;
- uc = b & 0x1f;
- } else if (b < 0xf0) {
- charsNeeded = 3;
- min_uc = 0x800;
- uc = b & 0x0f;
- } else if (b < 0xf5) {
- charsNeeded = 4;
- min_uc = 0x10000;
- uc = b & 0x07;
- } else {
- // the last Unicode character is U+10FFFF
- // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
- // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
- return Traits::Error;
- }
-
- int bytesAvailable = Traits::availableBytes(src, end);
- if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
- // it's possible that we have an error instead of just unfinished bytes
- if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
- return Traits::Error;
- if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
- return Traits::Error;
- return Traits::EndOfString;
- }
-
- // first continuation character
- b = Traits::peekByte(src, 0);
- if (!isContinuationByte(b))
- return Traits::Error;
- uc <<= 6;
- uc |= b & 0x3f;
-
- if (charsNeeded > 2) {
- // second continuation character
- b = Traits::peekByte(src, 1);
- if (!isContinuationByte(b))
- return Traits::Error;
- uc <<= 6;
- uc |= b & 0x3f;
-
- if (charsNeeded > 3) {
- // third continuation character
- b = Traits::peekByte(src, 2);
- if (!isContinuationByte(b))
- return Traits::Error;
- uc <<= 6;
- uc |= b & 0x3f;
- }
- }
-
- // we've decoded something; safety-check it
- if (!Traits::isTrusted) {
- if (uc < min_uc)
- return Traits::Error;
- if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
- return Traits::Error;
- if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
- return Traits::Error;
- }
-
- // write the UTF-16 sequence
- if (!QChar::requiresSurrogates(uc)) {
- // UTF-8 decoded and no surrogates are required
- // detach if necessary
- Traits::appendUtf16(dst, ushort(uc));
- } else {
- // UTF-8 decoded to something that requires a surrogate pair
- Traits::appendUcs4(dst, uc);
- }
-
- Traits::advanceByte(src, charsNeeded - 1);
- return charsNeeded;
- }
-}
-
-enum DataEndianness
-{
- DetectEndianness,
- BigEndianness,
- LittleEndianness
-};
-
-struct QUtf8
-{
- static QChar *convertToUnicode(QChar *, const char *, int) noexcept;
- static QString convertToUnicode(const char *, int);
- static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
- static QByteArray convertFromUnicode(const QChar *, int);
- static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
- struct ValidUtf8Result {
- bool isValidUtf8;
- bool isValidAscii;
- };
- static ValidUtf8Result isValidUtf8(const char *, qsizetype);
- static int compareUtf8(const char *, qsizetype, const QChar *, int);
- static int compareUtf8(const char *, qsizetype, QLatin1String s);
-};
-
-struct QUtf16
-{
- static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
- static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
-};
-
-struct QUtf32
-{
- static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
- static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
-};
-
-/*
- Converts from different utf encodings looking at a possible byte order mark at the
- beginning of the string. If no BOM exists, utf-8 is assumed.
- */
-QString Q_CORE_EXPORT qFromUtfEncoded(const QByteArray &ba);
-
-
#if QT_CONFIG(textcodec)
class QUtf8Codec : public QTextCodec {
diff --git a/src/corelib/global/qconfig-bootstrapped.h b/src/corelib/global/qconfig-bootstrapped.h
index 349dfeea1c..6ef4acf503 100644
--- a/src/corelib/global/qconfig-bootstrapped.h
+++ b/src/corelib/global/qconfig-bootstrapped.h
@@ -141,18 +141,17 @@
#define QT_FEATURE_zstd -1
#endif
+#define QT_FEATURE_textcodec -1
+
#ifdef QT_BUILD_QMAKE
#define QT_FEATURE_commandlineparser -1
#define QT_NO_COMPRESS
#define QT_JSON_READONLY
#define QT_FEATURE_settings 1
#define QT_NO_STANDARDPATHS
-#define QT_FEATURE_textcodec -1
#else
-#define QT_FEATURE_codecs -1
#define QT_FEATURE_commandlineparser 1
#define QT_FEATURE_settings -1
-#define QT_FEATURE_textcodec 1
#endif
#endif // QT_BOOTSTRAPPED
diff --git a/src/corelib/io/qfilesystemiterator_unix.cpp b/src/corelib/io/qfilesystemiterator_unix.cpp
index ceea3a467c..4bc6b2e31b 100644
--- a/src/corelib/io/qfilesystemiterator_unix.cpp
+++ b/src/corelib/io/qfilesystemiterator_unix.cpp
@@ -42,7 +42,7 @@
#if QT_CONFIG(textcodec)
# include <qtextcodec.h>
-# include <private/qutfcodec_p.h>
+# include <private/qstringconverter_p.h>
#endif
#ifndef QT_NO_FILESYSTEMITERATOR
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp
index 1c9d0d1d4b..2788de3b3a 100644
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@@ -38,7 +38,7 @@
****************************************************************************/
#include "qurl.h"
-#include "private/qutfcodec_p.h"
+#include "private/qstringconverter_p.h"
#include "private/qtools_p.h"
#include "private/qsimd_p.h"
diff --git a/src/corelib/serialization/qcborstreamreader.cpp b/src/corelib/serialization/qcborstreamreader.cpp
index ec385e0629..c49a76aada 100644
--- a/src/corelib/serialization/qcborstreamreader.cpp
+++ b/src/corelib/serialization/qcborstreamreader.cpp
@@ -44,7 +44,7 @@
#include <private/qbytearray_p.h>
#include <private/qnumeric_p.h>
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
#include <qdebug.h>
#include <qstack.h>
diff --git a/src/corelib/serialization/qcborvalue_p.h b/src/corelib/serialization/qcborvalue_p.h
index 1d686f118b..38383c7522 100644
--- a/src/corelib/serialization/qcborvalue_p.h
+++ b/src/corelib/serialization/qcborvalue_p.h
@@ -54,7 +54,7 @@
#include "qcborvalue.h"
#include <private/qglobal_p.h>
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
#include <math.h>
diff --git a/src/corelib/serialization/qjsonparser.cpp b/src/corelib/serialization/qjsonparser.cpp
index 46d82ea47f..116e7f6995 100644
--- a/src/corelib/serialization/qjsonparser.cpp
+++ b/src/corelib/serialization/qjsonparser.cpp
@@ -44,7 +44,7 @@
#include <qdebug.h>
#include "qjsonparser_p.h"
#include "qjson_p.h"
-#include "private/qutfcodec_p.h"
+#include "private/qstringconverter_p.h"
#include "private/qcborvalue_p.h"
#include "private/qnumeric_p.h"
diff --git a/src/corelib/serialization/qjsonwriter.cpp b/src/corelib/serialization/qjsonwriter.cpp
index 590b59f09c..8610cdff7e 100644
--- a/src/corelib/serialization/qjsonwriter.cpp
+++ b/src/corelib/serialization/qjsonwriter.cpp
@@ -42,7 +42,7 @@
#include <qlocale.h>
#include "qjsonwriter_p.h"
#include "qjson_p.h"
-#include "private/qutfcodec_p.h"
+#include "private/qstringconverter_p.h"
#include <private/qnumeric_p.h>
#include <private/qcborvalue_p.h>
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 0682395ebf..68a0f757c8 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -48,7 +48,7 @@
#if QT_CONFIG(textcodec)
#include <qtextcodec.h>
#endif
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
#include "qlocale_tools_p.h"
#include "private/qsimd_p.h"
#include <qnumeric.h>
diff --git a/src/corelib/text/qstringbuilder.cpp b/src/corelib/text/qstringbuilder.cpp
index 29bd216e80..4e47ba0922 100644
--- a/src/corelib/text/qstringbuilder.cpp
+++ b/src/corelib/text/qstringbuilder.cpp
@@ -38,7 +38,7 @@
****************************************************************************/
#include "qstringbuilder.h"
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
QT_BEGIN_NAMESPACE
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 1f61eee5cb..36567f5106 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -39,10 +39,954 @@
****************************************************************************/
#include <qstringconverter.h>
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
+#include "qendian.h"
+
+#include "private/qsimd_p.h"
+#include "private/qstringiterator_p.h"
QT_BEGIN_NAMESPACE
+enum { Endian = 0, Data = 1 };
+
+static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
+
+#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
+ || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
+// Returns qCountLeadingZeroBits(v) XOR-ed with (bit width of unsigned - 1).
+// With a 32-bit unsigned and a non-zero v this is 31 - clz(v), i.e. the
+// zero-based position, counted from the least significant bit, of the
+// highest bit that is set in v.
+static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
+{
+    constexpr uint topBitIndex = sizeof(unsigned) * 8 - 1;
+    return qCountLeadingZeroBits(v) ^ topBitIndex;
+}
+#endif
+
+#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
+static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+{ // Narrows UTF-16 units from src into bytes at dst for as long as they are non-NUL ASCII; returns true only if src reached end.
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16, dst += 16) {
+# ifdef __AVX2__
+ __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ __m128i data1 = _mm256_castsi256_si128(data);
+ __m128i data2 = _mm256_extracti128_si256(data, 1);
+# else
+ __m128i data1 = _mm_loadu_si128((const __m128i*)src);
+ __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
+# endif
+
+ // check if everything is ASCII
+ // the highest ASCII value is U+007F
+ // Do the packing directly:
+ // The PACKUSWB instruction packs a signed 16-bit integer to an unsigned 8-bit
+ // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
+ // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
+ // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
+ // "non-ASCII", but it's an acceptable compromise.
+ __m128i packed = _mm_packus_epi16(data1, data2);
+ __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); // despite the name, this flags the bytes in 0x01..0x7f; the mask is inverted below
+
+ // store, even if there are non-ASCII characters here
+ _mm_storeu_si128((__m128i*)dst, packed);
+
+ // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
+ ushort n = ~_mm_movemask_epi8(nonAscii);
+ if (n) {
+ // find the next probable ASCII character
+ // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1; // one past the last flagged character of this chunk
+
+ n = qCountTrailingZeroBits(n); // index of the first flagged character
+ dst += n;
+ src += n;
+ return false;
+ }
+ }
+
+ if (end - src >= 8) {
+ // do eight characters at a time (src and dst are not advanced when all eight are ASCII, so the function then returns false)
+ __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+ __m128i packed = _mm_packus_epi16(data, data);
+ __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
+
+ // store even non-ASCII
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
+
+ uchar n = ~_mm_movemask_epi8(nonAscii); // keeps only the low eight mask bits
+ if (n) {
+ nextAscii = src + qBitScanReverse(n) + 1;
+ n = qCountTrailingZeroBits(n);
+ dst += n;
+ src += n;
+ return false;
+ }
+ }
+
+ return src == end; // true only when the sixteen-at-a-time loop consumed the whole input
+}
+
+static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+{ // Widens bytes from src into UTF-16 units at dst for as long as their high bit is clear; returns true only if src reached end.
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16, dst += 16) {
+ __m128i data = _mm_loadu_si128((const __m128i*)src);
+
+#ifdef __AVX2__
+ const int BitSpacing = 2; // after zero-extension each input byte occupies two bytes, hence two mask bits
+ // load and zero extend to an YMM register
+ const __m256i extended = _mm256_cvtepu8_epi16(data);
+
+ uint n = _mm256_movemask_epi8(extended);
+ if (!n) {
+ // store
+ _mm256_storeu_si256((__m256i*)dst, extended);
+ continue;
+ }
+#else
+ const int BitSpacing = 1;
+
+ // check if everything is ASCII
+ // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+ uint n = _mm_movemask_epi8(data);
+ if (!n) {
+ // unpack
+ _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+ _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+ continue;
+ }
+#endif
+
+ // copy the front part that is still ASCII
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= BitSpacing;
+ }
+
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ n = qBitScanReverse(n); // n was shifted above, so this position is relative to the current src
+ nextAscii = src + (n / BitSpacing) + 1;
+ return false;
+
+ }
+
+ if (end - src >= 8) {
+ __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
+ uint n = _mm_movemask_epi8(data) & 0xff;
+ if (!n) {
+ // unpack and store (src and dst are not advanced here, so the function then returns false)
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+ } else {
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= 1;
+ }
+
+ n = qBitScanReverse(n);
+ nextAscii = src + n + 1;
+ return false;
+ }
+ }
+
+ return src == end; // true only when the sixteen-at-a-time loop consumed the whole input
+}
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{ // Skips leading bytes whose high bit is clear; returns where the caller must resume byte-wise checking and sets nextAscii.
+#ifdef __AVX2__
+ // do 32 characters at a time
+ // (this is similar to simdTestMask in qstring.cpp)
+ const __m256i mask = _mm256_set1_epi8(0x80);
+ for ( ; end - src >= 32; src += 32) {
+ __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ if (_mm256_testz_si256(mask, data))
+ continue;
+
+ uint n = _mm256_movemask_epi8(data);
+ Q_ASSUME(n);
+
+ // find the next probable ASCII character
+ // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+#endif
+
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16) {
+ __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+
+ // check if everything is ASCII
+ // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+ uint n = _mm_movemask_epi8(data);
+ if (!n)
+ continue;
+
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+
+ // do four characters at a time
+ for ( ; end - src >= 4; src += 4) {
+ quint32 data = qFromUnaligned<quint32>(src);
+ data &= 0x80808080U; // keep only the high bit of each of the four bytes
+ if (!data)
+ continue;
+
+ // We don't try to guess which of the three bytes is ASCII and which
+ // one isn't. The chance that at least two of them are non-ASCII is
+ // better than 75%.
+ nextAscii = src;
+ return src; // start of the four-byte group, not necessarily the non-ASCII byte itself
+ }
+ nextAscii = end; // fewer than four bytes are left: the caller checks them one by one
+ return src;
+}
+#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
+static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+{ // Narrows UTF-16 units from src into bytes at dst, sixteen at a time, while none exceeds 0x7f; returns true only if src reached end.
+ uint16x8_t maxAscii = vdupq_n_u16(0x7f);
+ uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; // even bit positions, for the first de-interleaved lane
+ uint16x8_t mask2 = vshlq_n_u16(mask1, 1); // odd bit positions, for the second lane
+
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16, dst += 16) {
+ // load 2 lanes (or: "load interleaved")
+ uint16x8x2_t in = vld2q_u16(src);
+
+ // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
+ // add those together into a scalar, and merge the scalars.
+ uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
+ | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
+
+ // merge the two lanes by shifting the values of the second by 8 and inserting them
+ uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
+
+ // store, even if there are non-ASCII characters here
+ vst1q_u8(dst, vreinterpretq_u8_u16(out));
+
+ if (nonAscii) {
+ // find the next probable ASCII character
+ // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(nonAscii) + 1;
+
+ nonAscii = qCountTrailingZeroBits(nonAscii); // index of the first non-ASCII character
+ dst += nonAscii;
+ src += nonAscii;
+ return false;
+ }
+ }
+ return src == end; // a tail shorter than sixteen characters is left to the caller
+}
+
+static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+{ // Widens bytes from src into UTF-16 units at dst, eight at a time, while none is >= 0x80; returns true only if src reached end.
+ // do eight characters at a time
+ uint8x8_t msb_mask = vdup_n_u8(0x80);
+ uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; // one distinct bit per lane
+ for ( ; end - src >= 8; src += 8, dst += 8) {
+ uint8x8_t c = vld1_u8(src);
+ uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); // bit i is set when byte i is >= 0x80
+ if (!n) {
+ // store
+ vst1q_u16(dst, vmovl_u8(c));
+ continue;
+ }
+
+ // copy the front part that is still ASCII
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= 1;
+ }
+
+ // find the next probable ASCII character
+ // we don't want to load 8 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ n = qBitScanReverse(n);
+ nextAscii = src + n + 1;
+ return false;
+
+ }
+ return src == end; // a tail shorter than eight bytes is left to the caller
+}
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{ // Currently skips nothing: always returns src with nextAscii set to end (see the early return below).
+ // The SIMD code below is untested, so just force an early return until
+ // we've had the time to verify it works.
+ nextAscii = end;
+ return src; // everything after this statement is unreachable for now
+
+ // do eight characters at a time
+ uint8x8_t msb_mask = vdup_n_u8(0x80);
+ uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+ for ( ; end - src >= 8; src += 8) {
+ uint8x8_t c = vld1_u8(src);
+ uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
+ if (!n)
+ continue;
+
+ // find the next probable ASCII character
+ // we don't want to load 8 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+ nextAscii = end;
+ return src;
+}
+#else
+static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
+{ // Fallback when neither the SSE2 nor the AArch64 NEON variant is compiled; the arguments are taken by value and left untouched.
+ return false; // never reports "finished", so callers always run their scalar loop
+}
+
+static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
+{ // Fallback when neither the SSE2 nor the AArch64 NEON variant is compiled; the arguments are taken by value and left untouched.
+ return false; // never reports "finished", so callers always run their scalar loop
+}
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{ // Fallback when neither the SSE2 nor the AArch64 NEON variant is compiled: skips nothing.
+ nextAscii = end; // tells the caller to keep checking byte by byte up to end
+ return src;
+}
+#endif
+
+// Stateless UTF-16 to UTF-8 encoder: encodes the \a len code units starting
+// at \a uc and returns the resulting bytes. No BOM is written. Every code
+// unit that toUtf8() rejects (negative result) is emitted as a single '?'.
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len)
+{
+    // worst case: every UTF-16 code unit turns into three UTF-8 bytes
+    QByteArray utf8(3 * len, Qt::Uninitialized);
+    uchar *const first = reinterpret_cast<uchar *>(const_cast<char *>(utf8.constData()));
+    uchar *out = first;
+
+    const ushort *in = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = in + len;
+
+    while (in != end) {
+        // let the ASCII fast path run first; it returns true once it has
+        // consumed everything up to end
+        const ushort *nextAscii = end;
+        if (simdEncodeAscii(out, nextAscii, in, end))
+            break;
+
+        // scalar path, until the fast path is worth retrying
+        do {
+            const ushort unit = *in++;
+            if (QUtf8Functions::toUtf8<QUtf8BaseTraits>(unit, out, in, end) < 0) {
+                // encoding error - append '?'
+                *out++ = '?';
+            }
+        } while (in < nextAscii);
+    }
+
+    // drop the part of the worst-case buffer that was not needed
+    utf8.truncate(out - first);
+    return utf8;
+}
+
+// Stateful UTF-16 to UTF-8 encoder. \a state may be null.
+//
+// When a state is given:
+//  - a UTF-8 BOM is prepended unless IgnoreHeader is set; IgnoreHeader is
+//    always set on return;
+//  - ConvertInvalidToNull selects a NUL byte instead of '?' as replacement;
+//  - a code unit left over from the previous call (remainingChars != 0) is
+//    read from state_data[0] and encoded first;
+//  - a code unit for which toUtf8() reports EndOfString is stored back in
+//    state_data[0] with remainingChars set to 1;
+//  - the number of replaced code units is added to invalidChars.
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state)
+{
+    const bool writeBom = state && !(state->flags & QStringConverter::IgnoreHeader);
+    const uchar replacement =
+            (state && (state->flags & QStringConverter::ConvertInvalidToNull)) ? 0 : '?';
+    // -1 means "no pending code unit"
+    int pendingHigh = (state && state->remainingChars) ? int(state->state_data[0]) : -1;
+
+    // worst case: three bytes per code unit, plus the BOM
+    QByteArray rstr(3 * len + (writeBom ? 3 : 0), Qt::Uninitialized);
+    uchar *const base = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
+    uchar *cursor = base;
+
+    const ushort *src = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = src + len;
+
+    if (writeBom) {
+        // append UTF-8 BOM
+        for (uchar bomByte : utf8bom)
+            *cursor++ = bomByte;
+    }
+
+    int invalid = 0;
+    const ushort *nextAscii = src;
+    while (src != end) {
+        ushort unit;
+        if (pendingHigh == -1) {
+            // retry the ASCII fast path only once src has passed nextAscii
+            if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
+                break;
+            unit = *src++;
+        } else {
+            // resume with the code unit carried over in the state
+            unit = pendingHigh;
+            pendingHigh = -1;
+        }
+
+        const int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(unit, cursor, src, end);
+        if (Q_LIKELY(res >= 0))
+            continue;
+
+        if (res == QUtf8BaseTraits::EndOfString) {
+            // input ended in the middle of a pair: remember the code unit
+            pendingHigh = unit;
+            break;
+        }
+        if (res == QUtf8BaseTraits::Error) {
+            // encoding error
+            ++invalid;
+            *cursor++ = replacement;
+        }
+    }
+
+    rstr.resize(cursor - base);
+
+    if (state) {
+        state->invalidChars += invalid;
+        state->flags |= QStringConverter::IgnoreHeader;
+        if (pendingHigh >= 0) {
+            state->remainingChars = 1;
+            state->state_data[0] = pendingHigh;
+        } else {
+            state->remainingChars = 0;
+        }
+    }
+    return rstr;
+}
+
+// Stateless UTF-8 to UTF-16 decoder returning a QString; the decoding itself
+// is done by the (QChar *, const char *, qsizetype) overload.
+QString QUtf8::convertToUnicode(const char *chars, qsizetype len)
+{
+    // A buffer of len QChars is always enough, because no UTF-8 sequence
+    // needs more UTF-16 words than it has bytes:
+    //    UTF-8     UTF-16
+    //    1 byte    1 word
+    //    2 bytes   1 word
+    //    3 bytes   1 word
+    //    4 bytes   2 words (one surrogate pair)
+    // US-ASCII input fills the whole buffer, U+0080-U+07FF and non-BMP text
+    // half of it, and U+0800-U+FFFF text a third of it.
+    //
+    // Invalid input fits too: one replacement character is inserted per
+    // invalid byte.
+    QString utf16(len, Qt::Uninitialized);
+    QChar *const begin = const_cast<QChar *>(utf16.constData()); // freshly created, hence not shared
+    utf16.truncate(convertToUnicode(begin, chars, len) - begin);
+    return utf16;
+}
+
+/*!
+    \since 5.7
+    \overload
+
+    Decodes the \a len octets of UTF-8 starting at \a chars and writes the
+    resulting QChars to \a buffer, which must be large enough for the
+    result; \a len QChars are always sufficient.
+
+    A UTF-8 byte order mark at the very beginning of the input is skipped.
+    Whenever a sequence cannot be decoded, a QChar::ReplacementCharacter is
+    written in its place.
+
+    Returns a pointer to one past the last QChar written.
+
+    This function never throws.
+*/
+
+QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, qsizetype len) noexcept
+{
+    const uchar *const begin = reinterpret_cast<const uchar *>(chars);
+    const uchar *in = begin;
+    const uchar *const end = begin + len;
+    ushort *out = reinterpret_cast<ushort *>(buffer);
+
+    // attempt to do a full decoding in SIMD
+    const uchar *nextAscii = end;
+    if (simdDecodeAscii(out, nextAscii, in, end))
+        return reinterpret_cast<QChar *>(out);
+
+    // at least one non-ASCII entry
+    // if nothing has been consumed yet, the input may start with the UTF-8
+    // BOM; if so, skip it
+    const bool nothingConsumed = (in == begin);
+    if (Q_UNLIKELY(nothingConsumed && end - in >= 3
+                   && in[0] == utf8bom[0] && in[1] == utf8bom[1] && in[2] == utf8bom[2])) {
+        in += 3;
+    }
+
+    while (in < end) {
+        nextAscii = end;
+        if (simdDecodeAscii(out, nextAscii, in, end))
+            break;
+
+        // scalar path, until the fast path is worth retrying
+        do {
+            const uchar lead = *in++;
+            if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, out, in, end) < 0) {
+                // decoding error
+                *out++ = QChar::ReplacementCharacter;
+            }
+        } while (in < nextAscii);
+    }
+
+    return reinterpret_cast<QChar *>(out);
+}
+
+// Stateful UTF-8 to UTF-16 decoder. \a state may be null.
+//
+// Decodes the \a len bytes starting at \a chars and returns the result.
+//
+// When a state is given:
+//  - bytes of an unfinished sequence stored by the previous call
+//    (remainingChars / state_data) are completed with the new input first;
+//    if the new input is still too short, the bytes are stored again and a
+//    null QString is returned;
+//  - unless IgnoreHeader is set, a U+FEFF decoded from the first three
+//    bytes is dropped; IgnoreHeader is set once the first sequence has been
+//    decoded;
+//  - ConvertInvalidToNull selects QChar::Null instead of
+//    QChar::ReplacementCharacter for invalid sequences;
+//  - an unfinished sequence at the end of the input is saved in the state;
+//  - the number of invalid sequences is added to invalidChars.
+//
+// Without a state, an unfinished sequence at the end of the input yields
+// one QChar::ReplacementCharacter per remaining byte.
+QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state)
+{
+    bool headerdone = false;
+    ushort replacement = QChar::ReplacementCharacter;
+    int invalid = 0;
+    int res;
+    uchar ch = 0;
+
+    // See above for buffer requirements for stateless decoding. However, that
+    // fails if the state is not empty. The following situations can add to the
+    // requirements:
+    //  state contains      chars starts with           requirement
+    //   1 of 2 bytes       valid continuation          0
+    //   2 of 3 bytes       same                        0
+    //   3 bytes of 4       same                        +1 (need to insert surrogate pair)
+    //   1 of 2 bytes       invalid continuation        +1 (need to insert replacement and restart)
+    //   2 of 3 bytes       same                        +1 (same)
+    //   3 of 4 bytes       same                        +1 (same)
+    QString result(len + 1, Qt::Uninitialized);
+
+    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+
+    if (state) {
+        if (state->flags & QStringConverter::IgnoreHeader)
+            headerdone = true;
+        if (state->flags & QStringConverter::ConvertInvalidToNull)
+            replacement = QChar::Null;
+        if (state->remainingChars) {
+            // handle incoming state first
+            uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+            qsizetype remainingCharsCount = state->remainingChars;
+            // Compare as qsizetype: end - src is a qsizetype and may exceed
+            // the range of int, in which case an int comparison would pick a
+            // truncated (possibly negative) count for the memcpy below.
+            qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
+
+            memset(remainingCharsData, 0, sizeof(remainingCharsData));
+            memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+            memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+            const uchar *begin = &remainingCharsData[1];
+            res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+                    static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+            if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
+                // special case for len == 0:
+                // if we were supplied an empty string, terminate the previous, unfinished sequence with error
+                ++invalid;
+                *dst++ = replacement;
+            } else if (res == QUtf8BaseTraits::EndOfString) {
+                // if we got EndOfString again, then there were too few bytes in src;
+                // copy to our state and return
+                state->remainingChars = remainingCharsCount + newCharsToCopy;
+                memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+                return QString();
+            } else if (!headerdone && res >= 0) {
+                // eat the UTF-8 BOM
+                headerdone = true;
+                if (dst[-1] == 0xfeff)
+                    --dst;
+            }
+
+            // adjust src now that we have maybe consumed a few chars
+            if (res >= 0) {
+                // res counts the bytes of the whole sequence, including the
+                // ones that came from the state
+                Q_ASSERT(res > remainingCharsCount);
+                src += res - remainingCharsCount;
+            }
+        }
+    }
+
+    // main body, stateless decoding
+    res = 0;
+    const uchar *nextAscii = src;
+    const uchar *start = src;
+    while (res >= 0 && src < end) {
+        // retry the ASCII fast path only once src has passed nextAscii
+        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
+            break;
+
+        ch = *src++;
+        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+        if (!headerdone && res >= 0) {
+            headerdone = true;
+            if (src == start + 3) { // 3 == sizeof(utf8-bom)
+                // eat the UTF-8 BOM (it can only appear at the beginning of the string).
+                if (dst[-1] == 0xfeff)
+                    --dst;
+            }
+        }
+        if (res == QUtf8BaseTraits::Error) {
+            // replace the offending byte and keep decoding
+            res = 0;
+            ++invalid;
+            *dst++ = replacement;
+        }
+    }
+
+    if (!state && res == QUtf8BaseTraits::EndOfString) {
+        // unterminated UTF sequence: one replacement for the lead byte in ch,
+        // then one for each byte that follows it
+        *dst++ = QChar::ReplacementCharacter;
+        while (src++ < end)
+            *dst++ = QChar::ReplacementCharacter;
+    }
+
+    result.truncate(dst - (const ushort *)result.unicode());
+    if (state) {
+        state->invalidChars += invalid;
+        if (headerdone)
+            state->flags |= QStringConverter::IgnoreHeader;
+        if (res == QUtf8BaseTraits::EndOfString) {
+            --src; // unread the byte in ch
+            state->remainingChars = end - src;
+            memcpy(&state->state_data[0], src, end - src);
+        } else {
+            state->remainingChars = 0;
+        }
+    }
+    return result;
+}
+
+struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
+{ // Traits for running QUtf8Functions::fromUtf8() purely as a validator: the decoded values are thrown away.
+ struct NoOutput {}; // empty stand-in for the output pointer argument
+ static void appendUtf16(const NoOutput &, ushort) {} // discards the decoded code unit
+ static void appendUcs4(const NoOutput &, uint) {} // discards the decoded code point
+};
+
+// Checks the \a len bytes starting at \a chars without producing output.
+// Returns { false, false } as soon as a sequence fails to decode; otherwise
+// { true, ascii }, where ascii is true when no byte with the high bit set
+// was encountered.
+QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
+{
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *const end = src + len;
+    const uchar *nextAscii = src;
+    bool sawNonAscii = false;
+
+    while (src < end) {
+        // skip over plain ASCII in bulk once src has passed nextAscii
+        if (src >= nextAscii)
+            src = simdFindNonAscii(src, end, nextAscii);
+        if (src == end)
+            break;
+
+        do {
+            const uchar b = *src++;
+            if (b & 0x80) {
+                sawNonAscii = true;
+                QUtf8NoOutputTraits::NoOutput sink;
+                if (QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, sink, src, end) < 0) {
+                    // decoding error
+                    return { false, false };
+                }
+            }
+        } while (src < nextAscii);
+    }
+
+    return { true, !sawNonAscii };
+}
+
+// Compares a UTF-8 byte sequence with a UTF-16 string, code point by code point.
+// Returns the difference of the first pair of code points that differ; if one
+// input is a prefix of the other, the shorter one sorts first (-1, 0 or 1).
+// An undecodable UTF-8 sequence compares as U+FFFD (QChar::ReplacementCharacter).
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, qsizetype u16len)
+{
+    const uchar *cursor8 = reinterpret_cast<const uchar *>(utf8);
+    const uchar *limit8 = cursor8 + u8len;
+    QStringIterator cursor16(utf16, utf16 + u16len);
+
+    for (;;) {
+        if (cursor8 >= limit8 || !cursor16.hasNext())
+            break;
+
+        // Decode one code point from the UTF-8 side into lhs.
+        uint lhs;
+        uint *sink = &lhs;
+        const uchar lead = *cursor8++;
+        if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, sink, cursor8, limit8) < 0)
+            lhs = QChar::ReplacementCharacter; // decoding error
+
+        const uint rhs = cursor16.next();
+        if (lhs != rhs)
+            return int(lhs) - int(rhs);
+    }
+
+    // One side (or both) is exhausted: whichever still has data sorts last.
+    const bool utf8HasMore = limit8 > cursor8;
+    return utf8HasMore - int(cursor16.hasNext());
+}
+
+// Compares a UTF-8 byte sequence with a Latin-1 string, code point by code point.
+// Returns the difference of the first pair of code points that differ; if one
+// input is a prefix of the other, the shorter one sorts first (-1, 0 or 1).
+// An undecodable UTF-8 sequence compares as U+FFFD (QChar::ReplacementCharacter).
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
+{
+    const uchar *cursor8 = reinterpret_cast<const uchar *>(utf8);
+    const uchar *limit8 = cursor8 + u8len;
+    const uchar *cursorL1 = reinterpret_cast<const uchar *>(s.latin1());
+    const uchar *limitL1 = cursorL1 + s.size();
+
+    for (;;) {
+        if (cursor8 >= limit8 || cursorL1 >= limitL1)
+            break;
+
+        // Decode one code point from the UTF-8 side into lhs.
+        uint lhs;
+        uint *sink = &lhs;
+        const uchar lead = *cursor8++;
+        if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, sink, cursor8, limit8) < 0)
+            lhs = QChar::ReplacementCharacter; // decoding error
+
+        // Every Latin-1 byte is its own code point.
+        const uint rhs = *cursorL1++;
+        if (lhs != rhs)
+            return int(lhs) - int(rhs);
+    }
+
+    // One side (or both) is exhausted: whichever still has data sorts last.
+    const bool utf8HasMore = limit8 > cursor8;
+    const bool latin1HasMore = limitL1 > cursorL1;
+    return utf8HasMore - latin1HasMore;
+}
+
+// Encodes len UTF-16 code units from uc as a UTF-16 byte stream.
+// A 2-byte BOM is prepended unless state is given and already carries
+// IgnoreHeader. DetectEndianness selects the host byte order. When state is
+// given, it is updated so that later calls do not emit another BOM.
+QByteArray QUtf16::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{
+    const bool emitBom = !state || !(state->flags & QStringConverter::IgnoreHeader);
+
+    DataEndianness endian = e;
+    if (endian == DetectEndianness)
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+    const bool bigEndian = (endian == BigEndianness);
+
+    QByteArray d;
+    d.resize(2 * len + (emitBom ? 2 : 0));
+    char *out = d.data();
+
+    if (emitBom) {
+        QChar bom(QChar::ByteOrderMark);
+        if (bigEndian)
+            qToBigEndian(bom.unicode(), out);
+        else
+            qToLittleEndian(bom.unicode(), out);
+        out += 2;
+    }
+
+    // Bulk-convert the payload in the requested byte order.
+    if (bigEndian)
+        qToBigEndian<ushort>(uc, len, out);
+    else
+        qToLittleEndian<ushort>(uc, len, out);
+
+    if (state) {
+        state->flags |= QStringConverter::IgnoreHeader;
+        state->remainingChars = 0;
+    }
+    return d;
+}
+
+// Decodes len bytes of UTF-16 from chars into a QString, two bytes per code unit.
+// If state is given, decoding is resumable: a dangling odd byte, the resolved
+// endianness and the "header already handled" flag are read from it on entry
+// and written back on exit. Endian and Data are state_data indices declared
+// outside this block.
+// With DetectEndianness the first code unit is inspected for a BOM; without
+// a BOM the host byte order is used. The BOM itself is not copied to the output.
+QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{
+ DataEndianness endian = e;
+ bool half = false;
+ uchar buf = 0;
+ bool headerdone = false;
+ if (state) {
+ headerdone = state->flags & QStringConverter::IgnoreHeader;
+ if (endian == DetectEndianness)
+ endian = (DataEndianness)state->state_data[Endian];
+ // a pending byte from the previous call becomes the first half of the next code unit
+ if (state->remainingChars) {
+ half = true;
+ buf = state->state_data[Data];
+ }
+ }
+ // no BOM will be looked at any more, so fall back to the host byte order
+ if (headerdone && endian == DetectEndianness)
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+ QString result(len, Qt::Uninitialized); // worst case
+ QChar *qch = (QChar *)result.data();
+ while (len--) {
+ if (half) {
+ // second byte of a pair: assemble the code unit. While endian is still
+ // DetectEndianness the else branch runs, i.e. big-endian assembly.
+ QChar ch;
+ if (endian == LittleEndianness) {
+ ch.setRow(*chars++);
+ ch.setCell(buf);
+ } else {
+ ch.setRow(buf);
+ ch.setCell(*chars++);
+ }
+ if (!headerdone) {
+ // first code unit of the stream: BOM handling
+ headerdone = true;
+ if (endian == DetectEndianness) {
+ if (ch == QChar::ByteOrderSwapped) {
+ endian = LittleEndianness;
+ } else if (ch == QChar::ByteOrderMark) {
+ endian = BigEndianness;
+ } else {
+ // no BOM: assume host byte order and keep the code unit
+ if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+ endian = BigEndianness;
+ } else {
+ endian = LittleEndianness;
+ // ch was assembled big-endian above; swap its two bytes
+ ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
+ }
+ *qch++ = ch;
+ }
+ } else if (ch != QChar::ByteOrderMark) {
+ // explicit endianness: only a leading BOM is dropped
+ *qch++ = ch;
+ }
+ } else {
+ *qch++ = ch;
+ }
+ half = false;
+ } else {
+ // first byte of a pair: remember it until its partner arrives
+ buf = *chars++;
+ half = true;
+ }
+ }
+ result.truncate(qch - result.unicode());
+
+ if (state) {
+ if (headerdone)
+ state->flags |= QStringConverter::IgnoreHeader;
+ state->state_data[Endian] = endian;
+ // carry an unpaired trailing byte over to the next call
+ if (half) {
+ state->remainingChars = 1;
+ state->state_data[Data] = buf;
+ } else {
+ state->remainingChars = 0;
+ state->state_data[Data] = 0;
+ }
+ }
+ return result;
+}
+
+// Encodes len UTF-16 code units from uc as a UTF-32 byte stream.
+// The buffer is sized for 4 bytes per input code unit, plus a 4-byte BOM
+// unless state is given and already carries IgnoreHeader. DetectEndianness
+// selects the host byte order. When state is given, it is updated so that
+// later calls do not emit another BOM.
+QByteArray QUtf32::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{
+    const bool emitBom = !state || !(state->flags & QStringConverter::IgnoreHeader);
+
+    DataEndianness endian = e;
+    if (endian == DetectEndianness)
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+    const bool bigEndian = (endian == BigEndianness);
+
+    QByteArray d(4 * len + (emitBom ? 4 : 0), Qt::Uninitialized);
+    char *out = d.data();
+
+    if (emitBom) {
+        // U+FEFF as one 32-bit unit: 00 00 FE FF (big) or FF FE 00 00 (little)
+        const uint bom = 0xfeff;
+        if (bigEndian)
+            qToBigEndian(bom, out);
+        else
+            qToLittleEndian(bom, out);
+        out += 4;
+    }
+
+    // QStringIterator yields whole code points from the UTF-16 input.
+    QStringIterator it(uc, uc + len);
+    for (; it.hasNext(); out += 4) {
+        const uint cp = it.next();
+        if (bigEndian)
+            qToBigEndian(cp, out);
+        else
+            qToLittleEndian(cp, out);
+    }
+
+    if (state) {
+        state->flags |= QStringConverter::IgnoreHeader;
+        state->remainingChars = 0;
+    }
+    return d;
+}
+
+// Decodes len bytes of UTF-32 from chars into a QString, four bytes per code point.
+// If state is given, decoding is resumable: up to three pending bytes, the
+// resolved endianness and the "header already handled" flag are read from it
+// on entry and written back on exit. Endian and Data are state_data indices
+// declared outside this block.
+// With DetectEndianness the first 4-byte unit is inspected for a BOM; without
+// a BOM the host byte order is used. The BOM itself is not copied to the output.
+QString QUtf32::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{
+ DataEndianness endian = e;
+ uchar tuple[4];
+ int num = 0;
+ bool headerdone = false;
+ if (state) {
+ headerdone = state->flags & QStringConverter::IgnoreHeader;
+ if (endian == DetectEndianness) {
+ endian = (DataEndianness)state->state_data[Endian];
+ }
+ // restore the partially filled 4-byte unit from the previous call
+ num = state->remainingChars;
+ memcpy(tuple, &state->state_data[Data], 4);
+ }
+ // no BOM will be looked at any more, so fall back to the host byte order
+ if (headerdone && endian == DetectEndianness)
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+ QString result;
+ // (num + len) / 4 complete code points, each needing at most two UTF-16 units
+ result.resize((num + len) >> 2 << 1); // worst case
+ QChar *qch = (QChar *)result.data();
+
+ const char *end = chars + len;
+ while (chars < end) {
+ tuple[num++] = *chars++;
+ if (num == 4) {
+ if (!headerdone) {
+ // first complete unit of the stream: BOM handling
+ headerdone = true;
+ if (endian == DetectEndianness) {
+ // NOTE(review): the trailing "endian != ..." tests in the next two
+ // conditions are always true inside this branch.
+ if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
+ endian = LittleEndianness;
+ num = 0;
+ continue;
+ } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
+ endian = BigEndianness;
+ num = 0;
+ continue;
+ } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+ // no BOM: assume host byte order and decode this unit normally
+ endian = BigEndianness;
+ } else {
+ endian = LittleEndianness;
+ }
+ } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
+ // explicit endianness: only a leading BOM is dropped
+ num = 0;
+ continue;
+ }
+ }
+ uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
+ // QChar::fromUcs4 yields the UTF-16 code unit(s) for this code point
+ for (char16_t c : QChar::fromUcs4(code))
+ *qch++ = c;
+ num = 0;
+ }
+ }
+ result.truncate(qch - result.unicode());
+
+ if (state) {
+ if (headerdone)
+ state->flags |= QStringConverter::IgnoreHeader;
+ state->state_data[Endian] = endian;
+ // carry the incomplete unit (num bytes of tuple) over to the next call
+ state->remainingChars = num;
+ memcpy(&state->state_data[Data], tuple, 4);
+ }
+ return result;
+}
+
+// Decodes ba to a QString, choosing the encoding from a leading byte order mark:
+//  - a 4-byte U+FEFF in either byte order selects UTF-32,
+//  - a 2-byte U+FEFF in either byte order selects UTF-16,
+//  - anything else (including inputs too short to hold a BOM) is UTF-8.
+// UTF-32 is tested first because the little-endian UTF-32 BOM (FF FE 00 00)
+// begins with the little-endian UTF-16 BOM (FF FE).
+QString qFromUtfEncoded(const QByteArray &ba)
+{
+    const qsizetype arraySize = ba.size();
+    const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
+    const uint bom = 0xfeff;
+
+    if (arraySize > 3) {
+        const uint uc = qFromUnaligned<uint>(buf);
+        if (uc == qToBigEndian(bom) || uc == qToLittleEndian(bom))
+            return QUtf32::convertToUnicode(ba.constData(), arraySize, nullptr); // utf-32
+    }
+
+    if (arraySize > 1) {
+        const ushort uc = qFromUnaligned<ushort>(buf);
+        // Both alternatives must be compared against uc. A bare
+        // qToLittleEndian(ushort(bom)) is a non-zero constant, which would
+        // send every input of two or more bytes down the UTF-16 path.
+        if (uc == qToBigEndian(ushort(bom)) || uc == qToLittleEndian(ushort(bom)))
+            return QUtf16::convertToUnicode(ba.constData(), arraySize, nullptr); // utf-16
+    }
+    return QUtf8::convertToUnicode(ba.constData(), arraySize);
+}
+
/*!
\enum QStringConverter::Flag
@@ -60,7 +1004,8 @@ void QStringConverter::State::clear()
{
if (clearFn)
clearFn(this);
- state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
+ else
+ state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
remainingChars = 0;
invalidChars = 0;
}
diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h
new file mode 100644
index 0000000000..5764979542
--- /dev/null
+++ b/src/corelib/text/qstringconverter_p.h
@@ -0,0 +1,323 @@
+/****************************************************************************
+**
+** Copyright (C) 2018 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtCore module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QSTRINGCONVERTER_P_H
+#define QSTRINGCONVERTER_P_H
+
+//
+// W A R N I N G
+// -------------
+//
+// This file is not part of the Qt API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <QtCore/qstring.h>
+#include <QtCore/qendian.h>
+#include <QtCore/qstringconverter.h>
+
+QT_BEGIN_NAMESPACE
+
+// Default policy/IO traits for QUtf8Functions::toUtf8 and ::fromUtf8.
+// The flags select validation strictness; the static helpers abstract how
+// bytes and UTF-16 units are read from the input and appended to the output,
+// so derived traits can redirect or suppress either side.
+struct QUtf8BaseTraits
+{
+    // Policy switches consulted by the codec templates.
+    static const bool isTrusted = false;
+    static const bool allowNonCharacters = true;
+    static const bool skipAsciiHandling = false;
+
+    // Negative return codes of the codec templates.
+    static const int Error = -1;
+    static const int EndOfString = -2;
+
+    static bool isValidCharacter(uint u)
+    {
+        return int(u) >= 0;
+    }
+
+    // ---- UTF-8 byte side ----
+
+    static void appendByte(uchar *&ptr, uchar b)
+    {
+        *ptr = b;
+        ++ptr;
+    }
+
+    static uchar peekByte(const uchar *ptr, int n = 0)
+    {
+        return *(ptr + n);
+    }
+
+    static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
+    {
+        return end - ptr;
+    }
+
+    static void advanceByte(const uchar *&ptr, int n = 1)
+    {
+        ptr += n;
+    }
+
+    // ---- UTF-16 side ----
+
+    static void appendUtf16(ushort *&ptr, ushort uc)
+    {
+        *ptr = uc;
+        ++ptr;
+    }
+
+    // A code point written to a UTF-16 buffer becomes a surrogate pair.
+    static void appendUcs4(ushort *&ptr, uint uc)
+    {
+        appendUtf16(ptr, QChar::highSurrogate(uc));
+        appendUtf16(ptr, QChar::lowSurrogate(uc));
+    }
+
+    static ushort peekUtf16(const ushort *ptr, int n = 0)
+    {
+        return *(ptr + n);
+    }
+
+    static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
+    {
+        return end - ptr;
+    }
+
+    static void advanceUtf16(const ushort *&ptr, int n = 1)
+    {
+        ptr += n;
+    }
+
+    // ---- UCS-4 output: one 32-bit slot per appended value ----
+
+    static void appendUtf16(uint *&ptr, ushort uc)
+    {
+        *ptr = uc;
+        ++ptr;
+    }
+
+    static void appendUcs4(uint *&ptr, uint uc)
+    {
+        *ptr = uc;
+        ++ptr;
+    }
+};
+
+// Same as QUtf8BaseTraits, but with skipAsciiHandling set: toUtf8/fromUtf8
+// then skip their one-byte (< 0x80) fast path, so callers are expected to
+// have dealt with ASCII input themselves.
+struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
+{
+ static const bool skipAsciiHandling = true;
+};
+
+// Traits-driven building blocks for encoding and decoding one character
+// to/from UTF-8. The Traits type supplies the validation policy flags, the
+// Error/EndOfString codes and the primitives to read input and append output.
+namespace QUtf8Functions
+{
+    /// Encodes the UTF-16 code unit \a u (plus, for a high surrogate, the low
+    /// surrogate peeked from \a src) as UTF-8 into \a dst.
+    /// returns 0 on success;
+    /// Error if \a u is a low surrogate;
+    /// if \a u is a high surrogate, Error if the next isn't a low one,
+    /// EndOfString if we run into the end of the string;
+    /// Error for a non-character when Traits::allowNonCharacters is false.
+    /// \a src is only advanced when a surrogate pair is consumed.
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        if (!Traits::skipAsciiHandling && u < 0x80) {
+            // U+0000 to U+007F (US-ASCII) - one byte
+            Traits::appendByte(dst, uchar(u));
+            return 0;
+        } else if (u < 0x0800) {
+            // U+0080 to U+07FF - two bytes
+            // first of two bytes
+            Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
+        } else {
+            if (!QChar::isSurrogate(u)) {
+                // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
+                    return Traits::Error;
+
+                // first of three bytes
+                Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
+            } else {
+                // U+10000 to U+10FFFF - four bytes
+                // need to get one extra codepoint
+                if (Traits::availableUtf16(src, end) == 0)
+                    return Traits::EndOfString;
+
+                ushort low = Traits::peekUtf16(src);
+                if (!QChar::isHighSurrogate(u))
+                    return Traits::Error;
+                if (!QChar::isLowSurrogate(low))
+                    return Traits::Error;
+
+                Traits::advanceUtf16(src);
+                uint ucs4 = QChar::surrogateToUcs4(u, low);
+
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
+                    return Traits::Error;
+
+                // first byte
+                Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
+
+                // second of four bytes
+                Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
+
+                // for the rest of the bytes
+                u = ushort(ucs4);
+            }
+
+            // second to last byte
+            Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
+        }
+
+        // last byte
+        Traits::appendByte(dst, 0x80 | (u & 0x3f));
+        return 0;
+    }
+
+    /// returns true if \a b has the form 10xxxxxx, i.e. is a UTF-8 continuation byte
+    inline bool isContinuationByte(uchar b)
+    {
+        return (b & 0xc0) == 0x80;
+    }
+
+    /// Decodes the UTF-8 sequence whose first byte is \a b and whose
+    /// continuation bytes start at \a src, appending the result to \a dst.
+    /// returns the number of characters consumed (including \a b) in case of success;
+    /// returns negative in case of error: Traits::Error or Traits::EndOfString
+    /// \a src is only advanced (past the continuation bytes) on success.
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        int charsNeeded;
+        uint min_uc;
+        uint uc;
+
+        if (!Traits::skipAsciiHandling && b < 0x80) {
+            // US-ASCII
+            Traits::appendUtf16(dst, b);
+            return 1;
+        }
+
+        if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
+            // a UTF-8 first character must be at least 0xC0
+            // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
+            return Traits::Error;
+        } else if (b < 0xe0) {
+            charsNeeded = 2;
+            min_uc = 0x80;
+            uc = b & 0x1f;
+        } else if (b < 0xf0) {
+            charsNeeded = 3;
+            min_uc = 0x800;
+            uc = b & 0x0f;
+        } else if (b < 0xf5) {
+            charsNeeded = 4;
+            min_uc = 0x10000;
+            uc = b & 0x07;
+        } else {
+            // the last Unicode character is U+10FFFF
+            // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
+            // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
+            return Traits::Error;
+        }
+
+        // keep the full pointer-difference width: narrowing to int could wrap
+        // for inputs larger than INT_MAX bytes and misreport EndOfString/Error
+        qptrdiff bytesAvailable = Traits::availableBytes(src, end);
+        if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
+            // it's possible that we have an error instead of just unfinished bytes
+            if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
+                return Traits::Error;
+            if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
+                return Traits::Error;
+            return Traits::EndOfString;
+        }
+
+        // first continuation character
+        b = Traits::peekByte(src, 0);
+        if (!isContinuationByte(b))
+            return Traits::Error;
+        uc <<= 6;
+        uc |= b & 0x3f;
+
+        if (charsNeeded > 2) {
+            // second continuation character
+            b = Traits::peekByte(src, 1);
+            if (!isContinuationByte(b))
+                return Traits::Error;
+            uc <<= 6;
+            uc |= b & 0x3f;
+
+            if (charsNeeded > 3) {
+                // third continuation character
+                b = Traits::peekByte(src, 2);
+                if (!isContinuationByte(b))
+                    return Traits::Error;
+                uc <<= 6;
+                uc |= b & 0x3f;
+            }
+        }
+
+        // we've decoded something; safety-check it
+        if (!Traits::isTrusted) {
+            // overlong encoding: the value would have fit in a shorter sequence
+            if (uc < min_uc)
+                return Traits::Error;
+            if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
+                return Traits::Error;
+            if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
+                return Traits::Error;
+        }
+
+        // write the UTF-16 sequence
+        if (!QChar::requiresSurrogates(uc)) {
+            // UTF-8 decoded and no surrogates are required
+            Traits::appendUtf16(dst, ushort(uc));
+        } else {
+            // UTF-8 decoded to something that requires a surrogate pair
+            Traits::appendUcs4(dst, uc);
+        }
+
+        Traits::advanceByte(src, charsNeeded - 1);
+        return charsNeeded;
+    }
+}
+
+// Byte-order selector for the QUtf16 / QUtf32 converters.
+// DetectEndianness is their default argument: decoders then look for a BOM
+// (falling back to the host byte order), encoders use the host byte order.
+enum DataEndianness
+{
+ DetectEndianness,
+ BigEndianness,
+ LittleEndianness
+};
+
+// Static UTF-8 helpers: conversion to/from UTF-16, validation and comparison.
+// Overloads taking a QStringConverter::State * carry conversion state between calls.
+struct QUtf8
+{
+ static QChar *convertToUnicode(QChar *, const char *, qsizetype) noexcept;
+ static QString convertToUnicode(const char *, qsizetype);
+ static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *);
+ static QByteArray convertFromUnicode(const QChar *, qsizetype);
+ static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *);
+ // Result of isValidUtf8(): whether the input decoded cleanly, and whether
+ // it additionally contained no byte with the high bit set.
+ struct ValidUtf8Result {
+ bool isValidUtf8;
+ bool isValidAscii;
+ };
+ static ValidUtf8Result isValidUtf8(const char *, qsizetype);
+ // Code-point-wise comparison of UTF-8 data against UTF-16 / Latin-1 text;
+ // negative, zero or positive like a three-way compare.
+ static int compareUtf8(const char *, qsizetype, const QChar *, qsizetype);
+ static int compareUtf8(const char *, qsizetype, QLatin1String s);
+};
+
+// Static UTF-16 byte-stream <-> QString converters. The State pointer may be
+// null (one-shot conversion); the byte order defaults to DetectEndianness.
+struct QUtf16
+{
+ static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness);
+ static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness);
+};
+
+// Static UTF-32 byte-stream <-> QString converters. The State pointer may be
+// null (one-shot conversion); the byte order defaults to DetectEndianness.
+struct QUtf32
+{
+ static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness);
+ static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness);
+};
+
+/*
+ Converts from different utf encodings looking at a possible byte order mark at the
+ beginning of the string. If no BOM exists, utf-8 is assumed.
+ */
+Q_CORE_EXPORT QString qFromUtfEncoded(const QByteArray &ba);
+
+QT_END_NAMESPACE
+
+#endif // QSTRINGCONVERTER_P_H
diff --git a/src/corelib/text/text.pri b/src/corelib/text/text.pri
index 4c584cf958..1275c014a8 100644
--- a/src/corelib/text/text.pri
+++ b/src/corelib/text/text.pri
@@ -20,6 +20,7 @@ HEADERS += \
text/qstringalgorithms_p.h \
text/qstringbuilder.h \
text/qstringconverter.h \
+ text/qstringconverter_p.h \
text/qstringiterator_p.h \
text/qstringlist.h \
text/qstringliteral.h \
diff --git a/src/gui/kernel/qclipboard.cpp b/src/gui/kernel/qclipboard.cpp
index 72f27d3e49..3b42e78624 100644
--- a/src/gui/kernel/qclipboard.cpp
+++ b/src/gui/kernel/qclipboard.cpp
@@ -46,9 +46,7 @@
#include "qvariant.h"
#include "qbuffer.h"
#include "qimage.h"
-#if QT_CONFIG(textcodec)
-#include "private/qutfcodec_p.h"
-#endif
+#include "private/qstringconverter_p.h"
#include "private/qguiapplication_p.h"
#include <qpa/qplatformintegration.h>
diff --git a/src/tools/bootstrap/.prev_CMakeLists.txt b/src/tools/bootstrap/.prev_CMakeLists.txt
index 8f430c494e..f81e03adff 100644
--- a/src/tools/bootstrap/.prev_CMakeLists.txt
+++ b/src/tools/bootstrap/.prev_CMakeLists.txt
@@ -41,9 +41,6 @@ qt_add_module(Bootstrap
../../3rdparty/pcre2/src/pcre2_ucp.h
../../3rdparty/pcre2/src/pcre2_valid_utf.c
../../3rdparty/pcre2/src/pcre2_xclass.c
- ../../corelib/codecs/qlatincodec.cpp
- ../../corelib/codecs/qtextcodec.cpp
- ../../corelib/codecs/qutfcodec.cpp
../../corelib/global/qendian.cpp
../../corelib/global/qglobal.cpp
../../corelib/global/qlogging.cpp
@@ -109,7 +106,6 @@ qt_add_module(Bootstrap
../../corelib/text/qstringbuilder.cpp
../../corelib/text/qstringconverter.cpp
../../corelib/text/qstringlist.cpp
- ../../corelib/text/qstringview.cpp
../../corelib/text/qvsnprintf.cpp
../../corelib/time/qcalendar.cpp
../../corelib/time/qdatetime.cpp
diff --git a/src/tools/bootstrap/CMakeLists.txt b/src/tools/bootstrap/CMakeLists.txt
index 5a17888003..a5184fbb80 100644
--- a/src/tools/bootstrap/CMakeLists.txt
+++ b/src/tools/bootstrap/CMakeLists.txt
@@ -42,9 +42,6 @@ qt_extend_target(Bootstrap
../../3rdparty/pcre2/src/pcre2_ucp.h
../../3rdparty/pcre2/src/pcre2_valid_utf.c
../../3rdparty/pcre2/src/pcre2_xclass.c
- ../../corelib/codecs/qlatincodec.cpp
- ../../corelib/codecs/qtextcodec.cpp
- ../../corelib/codecs/qutfcodec.cpp
../../corelib/global/qendian.cpp
../../corelib/global/qglobal.cpp
../../corelib/global/qlogging.cpp
@@ -110,7 +107,6 @@ qt_extend_target(Bootstrap
../../corelib/text/qstringbuilder.cpp
../../corelib/text/qstringconverter.cpp
../../corelib/text/qstringlist.cpp
- ../../corelib/text/qstringview.cpp
../../corelib/text/qvsnprintf.cpp
../../corelib/time/qcalendar.cpp
../../corelib/time/qdatetime.cpp
diff --git a/src/tools/bootstrap/bootstrap.pro b/src/tools/bootstrap/bootstrap.pro
index 169c5fe1c2..5b7da8e687 100644
--- a/src/tools/bootstrap/bootstrap.pro
+++ b/src/tools/bootstrap/bootstrap.pro
@@ -28,9 +28,6 @@ INCLUDEPATH += \
$$PWD/../../3rdparty/pcre2/src
SOURCES += \
- ../../corelib/codecs/qlatincodec.cpp \
- ../../corelib/codecs/qtextcodec.cpp \
- ../../corelib/codecs/qutfcodec.cpp \
../../corelib/global/qendian.cpp \
../../corelib/global/qglobal.cpp \
../../corelib/global/qlogging.cpp \
@@ -96,7 +93,6 @@ SOURCES += \
../../corelib/text/qstringconverter.cpp \
../../corelib/text/qstring_compat.cpp \
../../corelib/text/qstringlist.cpp \
- ../../corelib/text/qstringview.cpp \
../../corelib/text/qvsnprintf.cpp \
../../corelib/time/qcalendar.cpp \
../../corelib/time/qdatetime.cpp \