summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Lars Knoll <lars.knoll@qt.io> 2020-04-17 12:10:21 +0200
committer Lars Knoll <lars.knoll@qt.io> 2020-05-14 07:46:38 +0200
commit ea0a08c898fed9cfd8d8eb16613e352740d3eb02 (patch)
tree 38c2fde7a8ca9266bda81a7a454f298a7e3b6aff
parent f64a6bd638d399403845fe52e6f8e52889f1f52b (diff)
Move the UTF conversion methods to qstringconverter
Separate them from the qutfcodec, so that the codec can later on be moved out of Qt Core. Fix the QUtf methods to take qsizetype instead of int for length arguments. This also makes it possible to not build QTextCodec into the bootstrap lib anymore.

Change-Id: I0b4f83139d61b19c651520a2f3a5012aa7e85cb8
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
-rw-r--r-- qmake/CMakeLists.txt 2
-rw-r--r-- qmake/Makefile.unix 11
-rw-r--r-- qmake/Makefile.win32 2
-rw-r--r-- qmake/qmake.pro 5
-rw-r--r-- src/corelib/CMakeLists.txt 1
-rw-r--r-- src/corelib/codecs/qutfcodec.cpp 940
-rw-r--r-- src/corelib/codecs/qutfcodec_p.h 262
-rw-r--r-- src/corelib/global/qconfig-bootstrapped.h 5
-rw-r--r-- src/corelib/io/qfilesystemiterator_unix.cpp 2
-rw-r--r-- src/corelib/io/qurlrecode.cpp 2
-rw-r--r-- src/corelib/serialization/qcborstreamreader.cpp 2
-rw-r--r-- src/corelib/serialization/qcborvalue_p.h 2
-rw-r--r-- src/corelib/serialization/qjsonparser.cpp 2
-rw-r--r-- src/corelib/serialization/qjsonwriter.cpp 2
-rw-r--r-- src/corelib/text/qstring.cpp 2
-rw-r--r-- src/corelib/text/qstringbuilder.cpp 2
-rw-r--r-- src/corelib/text/qstringconverter.cpp 949
-rw-r--r-- src/corelib/text/qstringconverter_p.h 323
-rw-r--r-- src/corelib/text/text.pri 1
-rw-r--r-- src/gui/kernel/qclipboard.cpp 4
-rw-r--r-- src/tools/bootstrap/.prev_CMakeLists.txt 4
-rw-r--r-- src/tools/bootstrap/CMakeLists.txt 4
-rw-r--r-- src/tools/bootstrap/bootstrap.pro 4
23 files changed, 1294 insertions, 1239 deletions
diff --git a/qmake/CMakeLists.txt b/qmake/CMakeLists.txt
index d724b44d59..fa7d50f234 100644
--- a/qmake/CMakeLists.txt
+++ b/qmake/CMakeLists.txt
@@ -40,7 +40,6 @@ qt_add_tool(qmake # special case
../src/3rdparty/pcre2/src/pcre2_ucp.h
../src/3rdparty/pcre2/src/pcre2_valid_utf.c
../src/3rdparty/pcre2/src/pcre2_xclass.c
- ../src/corelib/codecs/qutfcodec.cpp ../src/corelib/codecs/qutfcodec_p.h
../src/corelib/global/qendian.cpp # special case
../src/corelib/global/qglobal.cpp ../src/corelib/global/qglobal.h
../src/corelib/global/qlibraryinfo.cpp
@@ -105,6 +104,7 @@ qt_add_tool(qmake # special case
../src/corelib/tools/qringbuffer.cpp # special case
../src/corelib/text/qstring.cpp ../src/corelib/text/qstring.h
../src/corelib/text/qstringbuilder.cpp ../src/corelib/text/qstringbuilder.h
+ ../src/corelib/text/qstringconverter.cpp ../src/corelib/text/qstringconverter.h ../src/corelib/text/qstringconverter_p.h
../src/corelib/text/qstringlist.cpp ../src/corelib/text/qstringlist.h
../src/corelib/text/qstringmatcher.h
../src/corelib/tools/qvector.h
diff --git a/qmake/Makefile.unix b/qmake/Makefile.unix
index 98d255f2d5..c0b6704351 100644
--- a/qmake/Makefile.unix
+++ b/qmake/Makefile.unix
@@ -17,7 +17,6 @@ OBJS = \
#qt code (please keep in order matching DEPEND_SRC)
QOBJS = \
- qutfcodec.o \
qendian.o qglobal.o qlogging.o qmalloc.o qnumeric.o qoperatingsystemversion.o qrandom.o \
qabstractfileengine.o qbuffer.o qdatastream.o qdebug.o \
qdir.o qdiriterator.o \
@@ -32,7 +31,7 @@ QOBJS = \
qcalendar.o qgregoriancalendar.o qromancalendar.o \
qcryptographichash.o qdatetime.o qhash.o \
qlocale.o qlocale_tools.o qmap.o qregularexpression.o qregexp.o qringbuffer.o \
- qstringbuilder.o qstring.o qstringlist.o qversionnumber.o \
+ qstringbuilder.o qstring.o qstringconverter.o qstringlist.o qversionnumber.o \
qvsnprintf.o qxmlstream.o qxmlutils.o \
pcre2_auto_possess.o pcre2_chartables.o pcre2_compile.o pcre2_config.o \
pcre2_context.o pcre2_dfa_match.o pcre2_error.o pcre2_extuni.o \
@@ -74,7 +73,6 @@ DEPEND_SRC = \
$(QMKGENSRC)/win32/msvc_vcxproj.cpp \
$(QMKGENSRC)/win32/winmakefile.cpp \
$(QMKGENSRC)/xmloutput.cpp \
- $(SOURCE_PATH)/src/corelib/codecs/qutfcodec.cpp \
$(SOURCE_PATH)/src/corelib/global/qendian.cpp \
$(SOURCE_PATH)/src/corelib/global/qglobal.cpp \
$(SOURCE_PATH)/src/corelib/global/qlibraryinfo.cpp \
@@ -122,6 +120,7 @@ DEPEND_SRC = \
$(SOURCE_PATH)/src/corelib/text/qregularexpression.cpp \
$(SOURCE_PATH)/src/corelib/text/qregexp.cpp \
$(SOURCE_PATH)/src/corelib/text/qstringbuilder.cpp \
+ $(SOURCE_PATH)/src/corelib/text/qstringconverter.cpp \
$(SOURCE_PATH)/src/corelib/text/qstring.cpp \
$(SOURCE_PATH)/src/corelib/text/qstringlist.cpp \
$(SOURCE_PATH)/src/corelib/text/qvsnprintf.cpp \
@@ -380,15 +379,15 @@ qoperatingsystemversion_darwin.o: $(SOURCE_PATH)/src/corelib/global/qoperatingsy
qcore_foundation.o: $(SOURCE_PATH)/src/corelib/kernel/qcore_foundation.mm
$(CXX) -c -o $@ $(CXXFLAGS) $<
-qutfcodec.o: $(SOURCE_PATH)/src/corelib/codecs/qutfcodec.cpp
- $(CXX) -c -o $@ $(CXXFLAGS) $<
-
qstring.o: $(SOURCE_PATH)/src/corelib/text/qstring.cpp
$(CXX) -c -o $@ $(CXXFLAGS) $<
qstringbuilder.o: $(SOURCE_PATH)/src/corelib/text/qstringbuilder.cpp
$(CXX) -c -o $@ $(CXXFLAGS) $<
+qstringconverter.o: $(SOURCE_PATH)/src/corelib/text/qstringconverter.cpp
+ $(CXX) -c -o $@ $(CXXFLAGS) $<
+
qlocale.o: $(SOURCE_PATH)/src/corelib/text/qlocale.cpp
$(CXX) -c -o $@ $(CXXFLAGS) $<
diff --git a/qmake/Makefile.win32 b/qmake/Makefile.win32
index d3a85c17b2..df47dacd15 100644
--- a/qmake/Makefile.win32
+++ b/qmake/Makefile.win32
@@ -104,8 +104,8 @@ QTOBJS= \
qoperatingsystemversion_win.obj \
qregexp.obj \
qromancalendar.obj \
- qutfcodec.obj \
qstring.obj \
+ qstringconverter.obj \
qstringlist.obj \
qstringbuilder.obj \
qsystemerror.obj \
diff --git a/qmake/qmake.pro b/qmake/qmake.pro
index 243f07ac2c..fcd1c17dcf 100644
--- a/qmake/qmake.pro
+++ b/qmake/qmake.pro
@@ -159,11 +159,11 @@ SOURCES += \
qsettings.cpp \
qstring.cpp \
qstringbuilder.cpp \
+ qstringconverter.cpp \
qstringlist.cpp \
qsystemerror.cpp \
qtemporaryfile.cpp \
qtextstream.cpp \
- qutfcodec.cpp \
quuid.cpp \
qvariant.cpp \
qversionnumber.cpp \
@@ -217,12 +217,13 @@ HEADERS += \
qromancalendar_p.h \
qstring.h \
qstringbuilder.h \
+ qstringconverter_p.h \
+ qstringconverter.h \
qstringlist.h \
qstringmatcher.h \
qsystemerror_p.h \
qtemporaryfile.h \
qtextstream.h \
- qutfcodec_p.h \
quuid.h \
qvector.h \
qversionnumber.h \
diff --git a/src/corelib/CMakeLists.txt b/src/corelib/CMakeLists.txt
index ff28b2d20c..710d025caf 100644
--- a/src/corelib/CMakeLists.txt
+++ b/src/corelib/CMakeLists.txt
@@ -169,6 +169,7 @@ qt_add_module(Core
text/qstring.cpp text/qstring.h
text/qstring_compat.cpp
text/qstringalgorithms.h text/qstringalgorithms_p.h
+ text/qstringconverter.cpp text/qstringconverter.h text/qstringconverter_p.h
text/qstringbuilder.cpp text/qstringbuilder.h
text/qstringiterator_p.h
text/qstringlist.cpp text/qstringlist.h
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index a31bfbd218..c518ab1d9c 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -48,946 +48,6 @@
QT_BEGIN_NAMESPACE
-enum { Endian = 0, Data = 1 };
-
-static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
-
-#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
- || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
-static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
-{
- uint result = qCountLeadingZeroBits(v);
- // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
- // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
- // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
- result ^= sizeof(unsigned) * 8 - 1;
- return result;
-}
-#endif
-
-#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
-static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
-{
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16, dst += 16) {
-# ifdef __AVX2__
- __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
- __m128i data1 = _mm256_castsi256_si128(data);
- __m128i data2 = _mm256_extracti128_si256(data, 1);
-# else
- __m128i data1 = _mm_loadu_si128((const __m128i*)src);
- __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
-# endif
-
- // check if everything is ASCII
- // the highest ASCII value is U+007F
- // Do the packing directly:
- // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
- // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
- // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
- // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
- // "non-ASCII", but it's an acceptable compromise.
- __m128i packed = _mm_packus_epi16(data1, data2);
- __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
-
- // store, even if there are non-ASCII characters here
- _mm_storeu_si128((__m128i*)dst, packed);
-
- // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
- ushort n = ~_mm_movemask_epi8(nonAscii);
- if (n) {
- // find the next probable ASCII character
- // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- n = qCountTrailingZeroBits(n);
- dst += n;
- src += n;
- return false;
- }
- }
-
- if (end - src >= 8) {
- // do eight characters at a time
- __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
- __m128i packed = _mm_packus_epi16(data, data);
- __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
-
- // store even non-ASCII
- _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
-
- uchar n = ~_mm_movemask_epi8(nonAscii);
- if (n) {
- nextAscii = src + qBitScanReverse(n) + 1;
- n = qCountTrailingZeroBits(n);
- dst += n;
- src += n;
- return false;
- }
- }
-
- return src == end;
-}
-
-static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
-{
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16, dst += 16) {
- __m128i data = _mm_loadu_si128((const __m128i*)src);
-
-#ifdef __AVX2__
- const int BitSpacing = 2;
- // load and zero extend to an YMM register
- const __m256i extended = _mm256_cvtepu8_epi16(data);
-
- uint n = _mm256_movemask_epi8(extended);
- if (!n) {
- // store
- _mm256_storeu_si256((__m256i*)dst, extended);
- continue;
- }
-#else
- const int BitSpacing = 1;
-
- // check if everything is ASCII
- // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
- uint n = _mm_movemask_epi8(data);
- if (!n) {
- // unpack
- _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
- _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
- continue;
- }
-#endif
-
- // copy the front part that is still ASCII
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= BitSpacing;
- }
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- n = qBitScanReverse(n);
- nextAscii = src + (n / BitSpacing) + 1;
- return false;
-
- }
-
- if (end - src >= 8) {
- __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
- uint n = _mm_movemask_epi8(data) & 0xff;
- if (!n) {
- // unpack and store
- _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
- } else {
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= 1;
- }
-
- n = qBitScanReverse(n);
- nextAscii = src + n + 1;
- return false;
- }
- }
-
- return src == end;
-}
-
-static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
-{
-#ifdef __AVX2__
- // do 32 characters at a time
- // (this is similar to simdTestMask in qstring.cpp)
- const __m256i mask = _mm256_set1_epi8(0x80);
- for ( ; end - src >= 32; src += 32) {
- __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
- if (_mm256_testz_si256(mask, data))
- continue;
-
- uint n = _mm256_movemask_epi8(data);
- Q_ASSUME(n);
-
- // find the next probable ASCII character
- // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- // return the non-ASCII character
- return src + qCountTrailingZeroBits(n);
- }
-#endif
-
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16) {
- __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
-
- // check if everything is ASCII
- // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
- uint n = _mm_movemask_epi8(data);
- if (!n)
- continue;
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- // return the non-ASCII character
- return src + qCountTrailingZeroBits(n);
- }
-
- // do four characters at a time
- for ( ; end - src >= 4; src += 4) {
- quint32 data = qFromUnaligned<quint32>(src);
- data &= 0x80808080U;
- if (!data)
- continue;
-
- // We don't try to guess which of the three bytes is ASCII and which
- // one isn't. The chance that at least two of them are non-ASCII is
- // better than 75%.
- nextAscii = src;
- return src;
- }
- nextAscii = end;
- return src;
-}
-#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
-static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
-{
- uint16x8_t maxAscii = vdupq_n_u16(0x7f);
- uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
- uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
-
- // do sixteen characters at a time
- for ( ; end - src >= 16; src += 16, dst += 16) {
- // load 2 lanes (or: "load interleaved")
- uint16x8x2_t in = vld2q_u16(src);
-
- // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
- // add those together into a scalar, and merge the scalars.
- uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
- | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
-
- // merge the two lanes by shifting the values of the second by 8 and inserting them
- uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
-
- // store, even if there are non-ASCII characters here
- vst1q_u8(dst, vreinterpretq_u8_u16(out));
-
- if (nonAscii) {
- // find the next probable ASCII character
- // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(nonAscii) + 1;
-
- nonAscii = qCountTrailingZeroBits(nonAscii);
- dst += nonAscii;
- src += nonAscii;
- return false;
- }
- }
- return src == end;
-}
-
-static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
-{
- // do eight characters at a time
- uint8x8_t msb_mask = vdup_n_u8(0x80);
- uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
- for ( ; end - src >= 8; src += 8, dst += 8) {
- uint8x8_t c = vld1_u8(src);
- uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
- if (!n) {
- // store
- vst1q_u16(dst, vmovl_u8(c));
- continue;
- }
-
- // copy the front part that is still ASCII
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= 1;
- }
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- n = qBitScanReverse(n);
- nextAscii = src + n + 1;
- return false;
-
- }
- return src == end;
-}
-
-static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
-{
- // The SIMD code below is untested, so just force an early return until
- // we've had the time to verify it works.
- nextAscii = end;
- return src;
-
- // do eight characters at a time
- uint8x8_t msb_mask = vdup_n_u8(0x80);
- uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
- for ( ; end - src >= 8; src += 8) {
- uint8x8_t c = vld1_u8(src);
- uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
- if (!n)
- continue;
-
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- nextAscii = src + qBitScanReverse(n) + 1;
-
- // return the non-ASCII character
- return src + qCountTrailingZeroBits(n);
- }
- nextAscii = end;
- return src;
-}
-#else
-static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
-{
- return false;
-}
-
-static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
-{
- return false;
-}
-
-static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
-{
- nextAscii = end;
- return src;
-}
-#endif
-
-QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
-{
- // create a QByteArray with the worst case scenario size
- QByteArray result(len * 3, Qt::Uninitialized);
- uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
- const ushort *src = reinterpret_cast<const ushort *>(uc);
- const ushort *const end = src + len;
-
- while (src != end) {
- const ushort *nextAscii = end;
- if (simdEncodeAscii(dst, nextAscii, src, end))
- break;
-
- do {
- ushort uc = *src++;
- int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
- if (res < 0) {
- // encoding error - append '?'
- *dst++ = '?';
- }
- } while (src < nextAscii);
- }
-
- result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
- return result;
-}
-
-QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
-{
- uchar replacement = '?';
- int rlen = 3*len;
- int surrogate_high = -1;
- if (state) {
- if (state->flags & QTextCodec::ConvertInvalidToNull)
- replacement = 0;
- if (!(state->flags & QTextCodec::IgnoreHeader))
- rlen += 3;
- if (state->remainingChars)
- surrogate_high = state->state_data[0];
- }
-
-
- QByteArray rstr(rlen, Qt::Uninitialized);
- uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
- const ushort *src = reinterpret_cast<const ushort *>(uc);
- const ushort *const end = src + len;
-
- int invalid = 0;
- if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
- // append UTF-8 BOM
- *cursor++ = utf8bom[0];
- *cursor++ = utf8bom[1];
- *cursor++ = utf8bom[2];
- }
-
- const ushort *nextAscii = src;
- while (src != end) {
- int res;
- ushort uc;
- if (surrogate_high != -1) {
- uc = surrogate_high;
- surrogate_high = -1;
- res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
- } else {
- if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
- break;
-
- uc = *src++;
- res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
- }
- if (Q_LIKELY(res >= 0))
- continue;
-
- if (res == QUtf8BaseTraits::Error) {
- // encoding error
- ++invalid;
- *cursor++ = replacement;
- } else if (res == QUtf8BaseTraits::EndOfString) {
- surrogate_high = uc;
- break;
- }
- }
-
- rstr.resize(cursor - (const uchar*)rstr.constData());
- if (state) {
- state->invalidChars += invalid;
- state->flags |= QTextCodec::IgnoreHeader;
- state->remainingChars = 0;
- if (surrogate_high >= 0) {
- state->remainingChars = 1;
- state->state_data[0] = surrogate_high;
- }
- }
- return rstr;
-}
-
-QString QUtf8::convertToUnicode(const char *chars, int len)
-{
- // UTF-8 to UTF-16 always needs the exact same number of words or less:
- // UTF-8 UTF-16
- // 1 byte 1 word
- // 2 bytes 1 word
- // 3 bytes 1 word
- // 4 bytes 2 words (one surrogate pair)
- // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
- // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
- // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
- //
- // The table holds for invalid sequences too: we'll insert one replacement char
- // per invalid byte.
- QString result(len, Qt::Uninitialized);
- QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
- const QChar *end = convertToUnicode(data, chars, len);
- result.truncate(end - data);
- return result;
-}
-
-/*!
- \since 5.7
- \overload
-
- Converts the UTF-8 sequence of \a len octets beginning at \a chars to
- a sequence of QChar starting at \a buffer. The buffer is expected to be
- large enough to hold the result. An upper bound for the size of the
- buffer is \a len QChars.
-
- If, during decoding, an error occurs, a QChar::ReplacementCharacter is
- written.
-
- Returns a pointer to one past the last QChar written.
-
- This function never throws.
-*/
-
-QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, int len) noexcept
-{
- ushort *dst = reinterpret_cast<ushort *>(buffer);
- const uchar *src = reinterpret_cast<const uchar *>(chars);
- const uchar *end = src + len;
-
- // attempt to do a full decoding in SIMD
- const uchar *nextAscii = end;
- if (!simdDecodeAscii(dst, nextAscii, src, end)) {
- // at least one non-ASCII entry
- // check if we failed to decode the UTF-8 BOM; if so, skip it
- if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
- && end - src >= 3
- && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
- src += 3;
- }
-
- while (src < end) {
- nextAscii = end;
- if (simdDecodeAscii(dst, nextAscii, src, end))
- break;
-
- do {
- uchar b = *src++;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
- if (res < 0) {
- // decoding error
- *dst++ = QChar::ReplacementCharacter;
- }
- } while (src < nextAscii);
- }
- }
-
- return reinterpret_cast<QChar *>(dst);
-}
-
-QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
-{
- bool headerdone = false;
- ushort replacement = QChar::ReplacementCharacter;
- int invalid = 0;
- int res;
- uchar ch = 0;
-
- // See above for buffer requirements for stateless decoding. However, that
- // fails if the state is not empty. The following situations can add to the
- // requirements:
- // state contains chars starts with requirement
- // 1 of 2 bytes valid continuation 0
- // 2 of 3 bytes same 0
- // 3 bytes of 4 same +1 (need to insert surrogate pair)
- // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
- // 2 of 3 bytes same +1 (same)
- // 3 of 4 bytes same +1 (same)
- QString result(len + 1, Qt::Uninitialized);
-
- ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
- const uchar *src = reinterpret_cast<const uchar *>(chars);
- const uchar *end = src + len;
-
- if (state) {
- if (state->flags & QTextCodec::IgnoreHeader)
- headerdone = true;
- if (state->flags & QTextCodec::ConvertInvalidToNull)
- replacement = QChar::Null;
- if (state->remainingChars) {
- // handle incoming state first
- uchar remainingCharsData[4]; // longest UTF-8 sequence possible
- int remainingCharsCount = state->remainingChars;
- int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
-
- memset(remainingCharsData, 0, sizeof(remainingCharsData));
- memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
- memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
-
- const uchar *begin = &remainingCharsData[1];
- res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
- static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
- if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
- // special case for len == 0:
- // if we were supplied an empty string, terminate the previous, unfinished sequence with error
- ++invalid;
- *dst++ = replacement;
- } else if (res == QUtf8BaseTraits::EndOfString) {
- // if we got EndOfString again, then there were too few bytes in src;
- // copy to our state and return
- state->remainingChars = remainingCharsCount + newCharsToCopy;
- memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
- return QString();
- } else if (!headerdone && res >= 0) {
- // eat the UTF-8 BOM
- headerdone = true;
- if (dst[-1] == 0xfeff)
- --dst;
- }
-
- // adjust src now that we have maybe consumed a few chars
- if (res >= 0) {
- Q_ASSERT(res > remainingCharsCount);
- src += res - remainingCharsCount;
- }
- }
- }
-
- // main body, stateless decoding
- res = 0;
- const uchar *nextAscii = src;
- const uchar *start = src;
- while (res >= 0 && src < end) {
- if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
- break;
-
- ch = *src++;
- res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
- if (!headerdone && res >= 0) {
- headerdone = true;
- if (src == start + 3) { // 3 == sizeof(utf8-bom)
- // eat the UTF-8 BOM (it can only appear at the beginning of the string).
- if (dst[-1] == 0xfeff)
- --dst;
- }
- }
- if (res == QUtf8BaseTraits::Error) {
- res = 0;
- ++invalid;
- *dst++ = replacement;
- }
- }
-
- if (!state && res == QUtf8BaseTraits::EndOfString) {
- // unterminated UTF sequence
- *dst++ = QChar::ReplacementCharacter;
- while (src++ < end)
- *dst++ = QChar::ReplacementCharacter;
- }
-
- result.truncate(dst - (const ushort *)result.unicode());
- if (state) {
- state->invalidChars += invalid;
- if (headerdone)
- state->flags |= QTextCodec::IgnoreHeader;
- if (res == QUtf8BaseTraits::EndOfString) {
- --src; // unread the byte in ch
- state->remainingChars = end - src;
- memcpy(&state->state_data[0], src, end - src);
- } else {
- state->remainingChars = 0;
- }
- }
- return result;
-}
-
-struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
-{
- struct NoOutput {};
- static void appendUtf16(const NoOutput &, ushort) {}
- static void appendUcs4(const NoOutput &, uint) {}
-};
-
-QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
-{
- const uchar *src = reinterpret_cast<const uchar *>(chars);
- const uchar *end = src + len;
- const uchar *nextAscii = src;
- bool isValidAscii = true;
-
- while (src < end) {
- if (src >= nextAscii)
- src = simdFindNonAscii(src, end, nextAscii);
- if (src == end)
- break;
-
- do {
- uchar b = *src++;
- if ((b & 0x80) == 0)
- continue;
-
- isValidAscii = false;
- QUtf8NoOutputTraits::NoOutput output;
- int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
- if (res < 0) {
- // decoding error
- return { false, false };
- }
- } while (src < nextAscii);
- }
-
- return { true, isValidAscii };
-}
-
-int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len)
-{
- uint uc1, uc2;
- auto src1 = reinterpret_cast<const uchar *>(utf8);
- auto end1 = src1 + u8len;
- QStringIterator src2(utf16, utf16 + u16len);
-
- while (src1 < end1 && src2.hasNext()) {
- uchar b = *src1++;
- uint *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
- if (res < 0) {
- // decoding error
- uc1 = QChar::ReplacementCharacter;
- }
-
- uc2 = src2.next();
- if (uc1 != uc2)
- return int(uc1) - int(uc2);
- }
-
- // the shorter string sorts first
- return (end1 > src1) - int(src2.hasNext());
-}
-
-int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
-{
- uint uc1;
- auto src1 = reinterpret_cast<const uchar *>(utf8);
- auto end1 = src1 + u8len;
- auto src2 = reinterpret_cast<const uchar *>(s.latin1());
- auto end2 = src2 + s.size();
-
- while (src1 < end1 && src2 < end2) {
- uchar b = *src1++;
- uint *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
- if (res < 0) {
- // decoding error
- uc1 = QChar::ReplacementCharacter;
- }
-
- uint uc2 = *src2++;
- if (uc1 != uc2)
- return int(uc1) - int(uc2);
- }
-
- // the shorter string sorts first
- return (end1 > src1) - (end2 > src2);
-}
-
-QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- int length = 2*len;
- if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
- length += 2;
- }
- if (e == DetectEndianness) {
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
- }
-
- QByteArray d;
- d.resize(length);
- char *data = d.data();
- if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
- QChar bom(QChar::ByteOrderMark);
- if (endian == BigEndianness)
- qToBigEndian(bom.unicode(), data);
- else
- qToLittleEndian(bom.unicode(), data);
- data += 2;
- }
- if (endian == BigEndianness)
- qToBigEndian<ushort>(uc, len, data);
- else
- qToLittleEndian<ushort>(uc, len, data);
-
- if (state) {
- state->remainingChars = 0;
- state->flags |= QTextCodec::IgnoreHeader;
- }
- return d;
-}
-
-QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- bool half = false;
- uchar buf = 0;
- bool headerdone = false;
- if (state) {
- headerdone = state->flags & QTextCodec::IgnoreHeader;
- if (endian == DetectEndianness)
- endian = (DataEndianness)state->state_data[Endian];
- if (state->remainingChars) {
- half = true;
- buf = state->state_data[Data];
- }
- }
- if (headerdone && endian == DetectEndianness)
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
-
- QString result(len, Qt::Uninitialized); // worst case
- QChar *qch = (QChar *)result.data();
- while (len--) {
- if (half) {
- QChar ch;
- if (endian == LittleEndianness) {
- ch.setRow(*chars++);
- ch.setCell(buf);
- } else {
- ch.setRow(buf);
- ch.setCell(*chars++);
- }
- if (!headerdone) {
- headerdone = true;
- if (endian == DetectEndianness) {
- if (ch == QChar::ByteOrderSwapped) {
- endian = LittleEndianness;
- } else if (ch == QChar::ByteOrderMark) {
- endian = BigEndianness;
- } else {
- if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
- endian = BigEndianness;
- } else {
- endian = LittleEndianness;
- ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
- }
- *qch++ = ch;
- }
- } else if (ch != QChar::ByteOrderMark) {
- *qch++ = ch;
- }
- } else {
- *qch++ = ch;
- }
- half = false;
- } else {
- buf = *chars++;
- half = true;
- }
- }
- result.truncate(qch - result.unicode());
-
- if (state) {
- if (headerdone)
- state->flags |= QTextCodec::IgnoreHeader;
- state->state_data[Endian] = endian;
- if (half) {
- state->remainingChars = 1;
- state->state_data[Data] = buf;
- } else {
- state->remainingChars = 0;
- state->state_data[Data] = 0;
- }
- }
- return result;
-}
-
-QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- int length = 4*len;
- if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
- length += 4;
- }
- if (e == DetectEndianness) {
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
- }
-
- QByteArray d(length, Qt::Uninitialized);
- char *data = d.data();
- if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
- if (endian == BigEndianness) {
- data[0] = 0;
- data[1] = 0;
- data[2] = (char)0xfe;
- data[3] = (char)0xff;
- } else {
- data[0] = (char)0xff;
- data[1] = (char)0xfe;
- data[2] = 0;
- data[3] = 0;
- }
- data += 4;
- }
-
- QStringIterator i(uc, uc + len);
- if (endian == BigEndianness) {
- while (i.hasNext()) {
- uint cp = i.next();
- qToBigEndian(cp, data);
- data += 4;
- }
- } else {
- while (i.hasNext()) {
- uint cp = i.next();
- qToLittleEndian(cp, data);
- data += 4;
- }
- }
-
- if (state) {
- state->remainingChars = 0;
- state->flags |= QTextCodec::IgnoreHeader;
- }
- return d;
-}
-
-QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
-{
- DataEndianness endian = e;
- uchar tuple[4];
- int num = 0;
- bool headerdone = false;
- if (state) {
- headerdone = state->flags & QTextCodec::IgnoreHeader;
- if (endian == DetectEndianness) {
- endian = (DataEndianness)state->state_data[Endian];
- }
- num = state->remainingChars;
- memcpy(tuple, &state->state_data[Data], 4);
- }
- if (headerdone && endian == DetectEndianness)
- endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
-
- QString result;
- result.resize((num + len) >> 2 << 1); // worst case
- QChar *qch = (QChar *)result.data();
-
- const char *end = chars + len;
- while (chars < end) {
- tuple[num++] = *chars++;
- if (num == 4) {
- if (!headerdone) {
- headerdone = true;
- if (endian == DetectEndianness) {
- if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
- endian = LittleEndianness;
- num = 0;
- continue;
- } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
- endian = BigEndianness;
- num = 0;
- continue;
- } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
- endian = BigEndianness;
- } else {
- endian = LittleEndianness;
- }
- } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
- num = 0;
- continue;
- }
- }
- uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
- for (char16_t c : QChar::fromUcs4(code))
- *qch++ = c;
- num = 0;
- }
- }
- result.truncate(qch - result.unicode());
-
- if (state) {
- if (headerdone)
- state->flags |= QTextCodec::IgnoreHeader;
- state->state_data[Endian] = endian;
- state->remainingChars = num;
- memcpy(&state->state_data[Data], tuple, 4);
- }
- return result;
-}
-
-QString qFromUtfEncoded(const QByteArray &ba)
-{
- const int arraySize = ba.size();
- const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
- const uint bom = 0xfeff;
-
- if (arraySize > 3) {
- uint uc = qFromUnaligned<uint>(buf);
- if (uc == qToBigEndian(bom) || uc == qToLittleEndian(bom))
- return QUtf32::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-32
- }
-
- if (arraySize > 1) {
- ushort uc = qFromUnaligned<ushort>(buf);
- if (uc == qToBigEndian(ushort(bom)) || qToLittleEndian(ushort(bom)))
- return QUtf16::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-16
- }
- return QUtf8::convertToUnicode(ba.constData(), ba.length());
-}
-
#if QT_CONFIG(textcodec)
QUtf8Codec::~QUtf8Codec()
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index b1c7a23d4f..893a6db8e1 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -60,271 +60,11 @@
#include "QtCore/qtextcodec.h"
#endif
+#include "private/qstringconverter_p.h"
#include "private/qtextcodec_p.h"
QT_BEGIN_NAMESPACE
-struct QUtf8BaseTraits
-{
- static const bool isTrusted = false;
- static const bool allowNonCharacters = true;
- static const bool skipAsciiHandling = false;
- static const int Error = -1;
- static const int EndOfString = -2;
-
- static bool isValidCharacter(uint u)
- { return int(u) >= 0; }
-
- static void appendByte(uchar *&ptr, uchar b)
- { *ptr++ = b; }
-
- static uchar peekByte(const uchar *ptr, int n = 0)
- { return ptr[n]; }
-
- static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
- { return end - ptr; }
-
- static void advanceByte(const uchar *&ptr, int n = 1)
- { ptr += n; }
-
- static void appendUtf16(ushort *&ptr, ushort uc)
- { *ptr++ = uc; }
-
- static void appendUcs4(ushort *&ptr, uint uc)
- {
- appendUtf16(ptr, QChar::highSurrogate(uc));
- appendUtf16(ptr, QChar::lowSurrogate(uc));
- }
-
- static ushort peekUtf16(const ushort *ptr, int n = 0)
- { return ptr[n]; }
-
- static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
- { return end - ptr; }
-
- static void advanceUtf16(const ushort *&ptr, int n = 1)
- { ptr += n; }
-
- // it's possible to output to UCS-4 too
- static void appendUtf16(uint *&ptr, ushort uc)
- { *ptr++ = uc; }
-
- static void appendUcs4(uint *&ptr, uint uc)
- { *ptr++ = uc; }
-};
-
-struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
-{
- static const bool skipAsciiHandling = true;
-};
-
-namespace QUtf8Functions
-{
- /// returns 0 on success; errors can only happen if \a u is a surrogate:
- /// Error if \a u is a low surrogate;
- /// if \a u is a high surrogate, Error if the next isn't a low one,
- /// EndOfString if we run into the end of the string.
- template <typename Traits, typename OutputPtr, typename InputPtr> inline
- int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
- {
- if (!Traits::skipAsciiHandling && u < 0x80) {
- // U+0000 to U+007F (US-ASCII) - one byte
- Traits::appendByte(dst, uchar(u));
- return 0;
- } else if (u < 0x0800) {
- // U+0080 to U+07FF - two bytes
- // first of two bytes
- Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
- } else {
- if (!QChar::isSurrogate(u)) {
- // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
- if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
- return Traits::Error;
-
- // first of three bytes
- Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
- } else {
- // U+10000 to U+10FFFF - four bytes
- // need to get one extra codepoint
- if (Traits::availableUtf16(src, end) == 0)
- return Traits::EndOfString;
-
- ushort low = Traits::peekUtf16(src);
- if (!QChar::isHighSurrogate(u))
- return Traits::Error;
- if (!QChar::isLowSurrogate(low))
- return Traits::Error;
-
- Traits::advanceUtf16(src);
- uint ucs4 = QChar::surrogateToUcs4(u, low);
-
- if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
- return Traits::Error;
-
- // first byte
- Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
-
- // second of four bytes
- Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
-
- // for the rest of the bytes
- u = ushort(ucs4);
- }
-
- // second to last byte
- Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
- }
-
- // last byte
- Traits::appendByte(dst, 0x80 | (u & 0x3f));
- return 0;
- }
-
- inline bool isContinuationByte(uchar b)
- {
- return (b & 0xc0) == 0x80;
- }
-
- /// returns the number of characters consumed (including \a b) in case of success;
- /// returns negative in case of error: Traits::Error or Traits::EndOfString
- template <typename Traits, typename OutputPtr, typename InputPtr> inline
- int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
- {
- int charsNeeded;
- uint min_uc;
- uint uc;
-
- if (!Traits::skipAsciiHandling && b < 0x80) {
- // US-ASCII
- Traits::appendUtf16(dst, b);
- return 1;
- }
-
- if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
- // an UTF-8 first character must be at least 0xC0
- // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
- return Traits::Error;
- } else if (b < 0xe0) {
- charsNeeded = 2;
- min_uc = 0x80;
- uc = b & 0x1f;
- } else if (b < 0xf0) {
- charsNeeded = 3;
- min_uc = 0x800;
- uc = b & 0x0f;
- } else if (b < 0xf5) {
- charsNeeded = 4;
- min_uc = 0x10000;
- uc = b & 0x07;
- } else {
- // the last Unicode character is U+10FFFF
- // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
- // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
- return Traits::Error;
- }
-
- int bytesAvailable = Traits::availableBytes(src, end);
- if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
- // it's possible that we have an error instead of just unfinished bytes
- if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
- return Traits::Error;
- if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
- return Traits::Error;
- return Traits::EndOfString;
- }
-
- // first continuation character
- b = Traits::peekByte(src, 0);
- if (!isContinuationByte(b))
- return Traits::Error;
- uc <<= 6;
- uc |= b & 0x3f;
-
- if (charsNeeded > 2) {
- // second continuation character
- b = Traits::peekByte(src, 1);
- if (!isContinuationByte(b))
- return Traits::Error;
- uc <<= 6;
- uc |= b & 0x3f;
-
- if (charsNeeded > 3) {
- // third continuation character
- b = Traits::peekByte(src, 2);
- if (!isContinuationByte(b))
- return Traits::Error;
- uc <<= 6;
- uc |= b & 0x3f;
- }
- }
-
- // we've decoded something; safety-check it
- if (!Traits::isTrusted) {
- if (uc < min_uc)
- return Traits::Error;
- if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
- return Traits::Error;
- if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
- return Traits::Error;
- }
-
- // write the UTF-16 sequence
- if (!QChar::requiresSurrogates(uc)) {
- // UTF-8 decoded and no surrogates are required
- // detach if necessary
- Traits::appendUtf16(dst, ushort(uc));
- } else {
- // UTF-8 decoded to something that requires a surrogate pair
- Traits::appendUcs4(dst, uc);
- }
-
- Traits::advanceByte(src, charsNeeded - 1);
- return charsNeeded;
- }
-}
-
-enum DataEndianness
-{
- DetectEndianness,
- BigEndianness,
- LittleEndianness
-};
-
-struct QUtf8
-{
- static QChar *convertToUnicode(QChar *, const char *, int) noexcept;
- static QString convertToUnicode(const char *, int);
- static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
- static QByteArray convertFromUnicode(const QChar *, int);
- static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
- struct ValidUtf8Result {
- bool isValidUtf8;
- bool isValidAscii;
- };
- static ValidUtf8Result isValidUtf8(const char *, qsizetype);
- static int compareUtf8(const char *, qsizetype, const QChar *, int);
- static int compareUtf8(const char *, qsizetype, QLatin1String s);
-};
-
-struct QUtf16
-{
- static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
- static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
-};
-
-struct QUtf32
-{
- static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
- static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
-};
-
-/*
- Converts from different utf encodings looking at a possible byte order mark at the
- beginning of the string. If no BOM exists, utf-8 is assumed.
- */
-QString Q_CORE_EXPORT qFromUtfEncoded(const QByteArray &ba);
-
-
#if QT_CONFIG(textcodec)
class QUtf8Codec : public QTextCodec {
diff --git a/src/corelib/global/qconfig-bootstrapped.h b/src/corelib/global/qconfig-bootstrapped.h
index 349dfeea1c..6ef4acf503 100644
--- a/src/corelib/global/qconfig-bootstrapped.h
+++ b/src/corelib/global/qconfig-bootstrapped.h
@@ -141,18 +141,17 @@
#define QT_FEATURE_zstd -1
#endif
+#define QT_FEATURE_textcodec -1
+
#ifdef QT_BUILD_QMAKE
#define QT_FEATURE_commandlineparser -1
#define QT_NO_COMPRESS
#define QT_JSON_READONLY
#define QT_FEATURE_settings 1
#define QT_NO_STANDARDPATHS
-#define QT_FEATURE_textcodec -1
#else
-#define QT_FEATURE_codecs -1
#define QT_FEATURE_commandlineparser 1
#define QT_FEATURE_settings -1
-#define QT_FEATURE_textcodec 1
#endif
#endif // QT_BOOTSTRAPPED
diff --git a/src/corelib/io/qfilesystemiterator_unix.cpp b/src/corelib/io/qfilesystemiterator_unix.cpp
index ceea3a467c..4bc6b2e31b 100644
--- a/src/corelib/io/qfilesystemiterator_unix.cpp
+++ b/src/corelib/io/qfilesystemiterator_unix.cpp
@@ -42,7 +42,7 @@
#if QT_CONFIG(textcodec)
# include <qtextcodec.h>
-# include <private/qutfcodec_p.h>
+# include <private/qstringconverter_p.h>
#endif
#ifndef QT_NO_FILESYSTEMITERATOR
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp
index 1c9d0d1d4b..2788de3b3a 100644
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@@ -38,7 +38,7 @@
****************************************************************************/
#include "qurl.h"
-#include "private/qutfcodec_p.h"
+#include "private/qstringconverter_p.h"
#include "private/qtools_p.h"
#include "private/qsimd_p.h"
diff --git a/src/corelib/serialization/qcborstreamreader.cpp b/src/corelib/serialization/qcborstreamreader.cpp
index ec385e0629..c49a76aada 100644
--- a/src/corelib/serialization/qcborstreamreader.cpp
+++ b/src/corelib/serialization/qcborstreamreader.cpp
@@ -44,7 +44,7 @@
#include <private/qbytearray_p.h>
#include <private/qnumeric_p.h>
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
#include <qdebug.h>
#include <qstack.h>
diff --git a/src/corelib/serialization/qcborvalue_p.h b/src/corelib/serialization/qcborvalue_p.h
index 1d686f118b..38383c7522 100644
--- a/src/corelib/serialization/qcborvalue_p.h
+++ b/src/corelib/serialization/qcborvalue_p.h
@@ -54,7 +54,7 @@
#include "qcborvalue.h"
#include <private/qglobal_p.h>
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
#include <math.h>
diff --git a/src/corelib/serialization/qjsonparser.cpp b/src/corelib/serialization/qjsonparser.cpp
index 46d82ea47f..116e7f6995 100644
--- a/src/corelib/serialization/qjsonparser.cpp
+++ b/src/corelib/serialization/qjsonparser.cpp
@@ -44,7 +44,7 @@
#include <qdebug.h>
#include "qjsonparser_p.h"
#include "qjson_p.h"
-#include "private/qutfcodec_p.h"
+#include "private/qstringconverter_p.h"
#include "private/qcborvalue_p.h"
#include "private/qnumeric_p.h"
diff --git a/src/corelib/serialization/qjsonwriter.cpp b/src/corelib/serialization/qjsonwriter.cpp
index 590b59f09c..8610cdff7e 100644
--- a/src/corelib/serialization/qjsonwriter.cpp
+++ b/src/corelib/serialization/qjsonwriter.cpp
@@ -42,7 +42,7 @@
#include <qlocale.h>
#include "qjsonwriter_p.h"
#include "qjson_p.h"
-#include "private/qutfcodec_p.h"
+#include "private/qstringconverter_p.h"
#include <private/qnumeric_p.h>
#include <private/qcborvalue_p.h>
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 0682395ebf..68a0f757c8 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -48,7 +48,7 @@
#if QT_CONFIG(textcodec)
#include <qtextcodec.h>
#endif
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
#include "qlocale_tools_p.h"
#include "private/qsimd_p.h"
#include <qnumeric.h>
diff --git a/src/corelib/text/qstringbuilder.cpp b/src/corelib/text/qstringbuilder.cpp
index 29bd216e80..4e47ba0922 100644
--- a/src/corelib/text/qstringbuilder.cpp
+++ b/src/corelib/text/qstringbuilder.cpp
@@ -38,7 +38,7 @@
****************************************************************************/
#include "qstringbuilder.h"
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
QT_BEGIN_NAMESPACE
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 1f61eee5cb..36567f5106 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -39,10 +39,954 @@
****************************************************************************/
#include <qstringconverter.h>
-#include <private/qutfcodec_p.h>
+#include <private/qstringconverter_p.h>
+#include "qendian.h"
+
+#include "private/qsimd_p.h"
+#include "private/qstringiterator_p.h"
QT_BEGIN_NAMESPACE
+enum { Endian = 0, Data = 1 };
+
+static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
+
+#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
+ || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
+// Returns the index, counted upwards from the least significant bit, of the
+// highest bit that is set in \a v.
+static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
+{
+    // qCountLeadingZeroBits() counts downwards from the most significant bit,
+    // so a value with its top bit set gives 0. XOR-ing with the index of the
+    // top bit turns that count into an index counted from the bottom instead.
+    constexpr uint topBitIndex = sizeof(unsigned) * 8 - 1;
+    return qCountLeadingZeroBits(v) ^ topBitIndex;
+}
+#endif
+
+#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
+// Narrows UTF-16 code units from [src, end) to bytes at dst for as long as they
+// are non-NUL ASCII. Works on blocks of sixteen units, then on at most one block
+// of eight. When a block contains a unit that is flagged (non-ASCII or NUL), dst
+// and src are advanced to the first flagged unit, nextAscii is set to one past
+// the last flagged unit of that block, and false is returned. Otherwise the
+// return value tells whether src reached end.
+static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+{
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16, dst += 16) {
+# ifdef __AVX2__
+ __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ __m128i data1 = _mm256_castsi256_si128(data);
+ __m128i data2 = _mm256_extracti128_si256(data, 1);
+# else
+ __m128i data1 = _mm_loadu_si128((const __m128i*)src);
+ __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
+# endif
+
+ // check if everything is ASCII
+ // the highest ASCII value is U+007F
+ // Do the packing directly:
+ // The PACKUSWB instruction packs a signed 16-bit integer to an unsigned 8-bit
+ // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
+ // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
+ // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
+ // "non-ASCII", but it's an acceptable compromise.
+ __m128i packed = _mm_packus_epi16(data1, data2);
+ __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
+
+ // store, even if there are non-ASCII characters here
+ _mm_storeu_si128((__m128i*)dst, packed);
+
+ // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
+ ushort n = ~_mm_movemask_epi8(nonAscii);
+ if (n) {
+ // find the next probable ASCII character
+ // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // skip over the leading units that were fine; their bytes are already stored
+ n = qCountTrailingZeroBits(n);
+ dst += n;
+ src += n;
+ return false;
+ }
+ }
+
+ if (end - src >= 8) {
+ // do eight characters at a time
+ __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+ // the same eight units are packed into both halves; only the low half is stored
+ __m128i packed = _mm_packus_epi16(data, data);
+ __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
+
+ // store even non-ASCII
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
+
+ // truncating to uchar keeps one flag bit for each of the eight units
+ uchar n = ~_mm_movemask_epi8(nonAscii);
+ if (n) {
+ nextAscii = src + qBitScanReverse(n) + 1;
+ n = qCountTrailingZeroBits(n);
+ dst += n;
+ src += n;
+ return false;
+ }
+ // NOTE(review): when all eight units are ASCII, src and dst are not advanced
+ // here although the bytes were stored, so the function returns false below;
+ // presumably the callers' scalar loop re-encodes these units - confirm.
+ }
+
+ return src == end;
+}
+
+// Widens bytes from [src, end) to 16-bit units at dst for as long as they are
+// ASCII. Works on blocks of sixteen bytes, then on at most one block of eight.
+// When a block contains a byte with its high bit set, the ASCII bytes in front
+// of it are copied one by one, src is left pointing at that byte, nextAscii is
+// set to one past the last such byte of the block, and false is returned.
+// Otherwise the return value tells whether src reached end.
+static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+{
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16, dst += 16) {
+ __m128i data = _mm_loadu_si128((const __m128i*)src);
+
+#ifdef __AVX2__
+ // the mask is taken from the zero-extended 16-bit data, so each input byte
+ // occupies two mask bits (the upper one is always clear)
+ const int BitSpacing = 2;
+ // load and zero extend to an YMM register
+ const __m256i extended = _mm256_cvtepu8_epi16(data);
+
+ uint n = _mm256_movemask_epi8(extended);
+ if (!n) {
+ // store
+ _mm256_storeu_si256((__m256i*)dst, extended);
+ continue;
+ }
+#else
+ const int BitSpacing = 1;
+
+ // check if everything is ASCII
+ // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+ uint n = _mm_movemask_epi8(data);
+ if (!n) {
+ // unpack
+ _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+ _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+ continue;
+ }
+#endif
+
+ // copy the front part that is still ASCII
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= BitSpacing;
+ }
+
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ // (n was shifted along with src above, so the bit index is relative to the new src)
+ n = qBitScanReverse(n);
+ nextAscii = src + (n / BitSpacing) + 1;
+ return false;
+
+ }
+
+ if (end - src >= 8) {
+ __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
+ // only the low eight mask bits correspond to loaded bytes
+ uint n = _mm_movemask_epi8(data) & 0xff;
+ if (!n) {
+ // unpack and store
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+ // NOTE(review): src and dst are not advanced after this store, so the
+ // function returns false below; presumably the callers' scalar loop
+ // decodes these eight bytes again - confirm.
+ } else {
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= 1;
+ }
+
+ n = qBitScanReverse(n);
+ nextAscii = src + n + 1;
+ return false;
+ }
+ }
+
+ return src == end;
+}
+
+// Scans [src, end) for a byte with its high bit set. Returns a pointer to the
+// first such byte found by the 32- and 16-byte passes, or to the start of the
+// 4-byte group that contains one, and sets nextAscii as described at each
+// return. If no such byte is found, nextAscii is set to end and the returned
+// pointer is where scanning stopped, which leaves fewer than four bytes
+// unexamined.
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+#ifdef __AVX2__
+ // do 32 characters at a time
+ // (this is similar to simdTestMask in qstring.cpp)
+ const __m256i mask = _mm256_set1_epi8(0x80);
+ for ( ; end - src >= 32; src += 32) {
+ __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ if (_mm256_testz_si256(mask, data))
+ continue;
+
+ // the test above found at least one high bit, so the mask cannot be zero
+ uint n = _mm256_movemask_epi8(data);
+ Q_ASSUME(n);
+
+ // find the next probable ASCII character
+ // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+#endif
+
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16) {
+ __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+
+ // check if everything is ASCII
+ // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+ uint n = _mm_movemask_epi8(data);
+ if (!n)
+ continue;
+
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+
+ // do four characters at a time
+ for ( ; end - src >= 4; src += 4) {
+ quint32 data = qFromUnaligned<quint32>(src);
+ data &= 0x80808080U;
+ if (!data)
+ continue;
+
+ // We don't try to guess which of the three bytes is ASCII and which
+ // one isn't. The chance that at least two of them are non-ASCII is
+ // better than 75%.
+ // The start of the group is returned, not the exact non-ASCII byte.
+ nextAscii = src;
+ return src;
+ }
+ // fewer than four bytes are left; they are not examined here
+ nextAscii = end;
+ return src;
+}
+#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
+// NEON variant: narrows UTF-16 code units from [src, end) to bytes at dst in
+// blocks of sixteen. When a block contains a unit above 0x7f, dst and src are
+// advanced to the first such unit, nextAscii is set to one past the last such
+// unit of the block, and false is returned. Otherwise the return value tells
+// whether src reached end; there is no handling for a tail shorter than sixteen.
+static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+{
+ uint16x8_t maxAscii = vdupq_n_u16(0x7f);
+ // mask1 holds the even bit positions (0, 2, ..., 14); mask2 the odd ones (1, 3, ..., 15)
+ uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+ uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
+
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16, dst += 16) {
+ // load 2 lanes (or: "load interleaved")
+ uint16x8x2_t in = vld2q_u16(src);
+
+ // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
+ // add those together into a scalar, and merge the scalars.
+ uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
+ | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
+
+ // merge the two lanes by shifting the values of the second by 8 and inserting them
+ uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
+
+ // store, even if there are non-ASCII characters here
+ vst1q_u8(dst, vreinterpretq_u8_u16(out));
+
+ if (nonAscii) {
+ // find the next probable ASCII character
+ // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(nonAscii) + 1;
+
+ // skip over the leading units that were fine; their bytes are already stored
+ nonAscii = qCountTrailingZeroBits(nonAscii);
+ dst += nonAscii;
+ src += nonAscii;
+ return false;
+ }
+ }
+ return src == end;
+}
+
+// NEON variant: widens bytes from [src, end) to 16-bit units at dst in blocks
+// of eight. When a block contains a byte >= 0x80, the ASCII bytes in front of it
+// are copied one by one, src is left pointing at that byte, nextAscii is set to
+// one past the last such byte of the block, and false is returned. Otherwise the
+// return value tells whether src reached end.
+static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+{
+ // do eight characters at a time
+ uint8x8_t msb_mask = vdup_n_u8(0x80);
+ // one distinct bit per lane, so the horizontal add below yields a bitmask
+ uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+ for ( ; end - src >= 8; src += 8, dst += 8) {
+ uint8x8_t c = vld1_u8(src);
+ uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
+ if (!n) {
+ // store
+ vst1q_u16(dst, vmovl_u8(c));
+ continue;
+ }
+
+ // copy the front part that is still ASCII
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= 1;
+ }
+
+ // find the next probable ASCII character
+ // we don't want to load 8 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ // (n was shifted along with src above, so the bit index is relative to the new src)
+ n = qBitScanReverse(n);
+ nextAscii = src + n + 1;
+ return false;
+
+ }
+ return src == end;
+}
+
+// NEON variant. Currently disabled: it always sets nextAscii to end and returns
+// src unchanged, so the caller examines every byte itself. The code after the
+// early return is unreachable and kept for later verification.
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+ // The SIMD code below is untested, so just force an early return until
+ // we've had the time to verify it works.
+ nextAscii = end;
+ return src;
+
+ // do eight characters at a time
+ uint8x8_t msb_mask = vdup_n_u8(0x80);
+ // one distinct bit per lane, so the horizontal add below yields a bitmask
+ uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+ for ( ; end - src >= 8; src += 8) {
+ uint8x8_t c = vld1_u8(src);
+ uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
+ if (!n)
+ continue;
+
+ // find the next probable ASCII character
+ // we don't want to load 8 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+ nextAscii = end;
+ return src;
+}
+#else
+// Fallback without SIMD support: does nothing and reports that the input was
+// not fully handled. The arguments are taken by value and left untouched.
+static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
+{
+ return false;
+}
+
+// Fallback without SIMD support: does nothing and reports that the input was
+// not fully handled. The arguments are taken by value and left untouched.
+static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
+{
+ return false;
+}
+
+// Fallback without SIMD support: skips nothing. Returns src unchanged and sets
+// nextAscii to end.
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+ nextAscii = end;
+ return src;
+}
+#endif
+
+// Stateless UTF-16 to UTF-8 conversion of the \a len code units starting at
+// \a uc. Anything that cannot be encoded is written as '?'.
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len)
+{
+    // three bytes per UTF-16 code unit is the most the encoder can produce
+    QByteArray utf8(len * 3, Qt::Uninitialized);
+    uchar *const first = reinterpret_cast<uchar *>(const_cast<char *>(utf8.constData()));
+    uchar *out = first;
+    const ushort *in = reinterpret_cast<const ushort *>(uc);
+    const ushort *const inEnd = in + len;
+
+    while (in != inEnd) {
+        // let the SIMD helper take the ASCII run; it reports how far the
+        // scalar encoder below should go before SIMD is tried again
+        const ushort *scalarEnd = inEnd;
+        if (simdEncodeAscii(out, scalarEnd, in, inEnd))
+            break;
+
+        do {
+            const ushort unit = *in++;
+            if (QUtf8Functions::toUtf8<QUtf8BaseTraits>(unit, out, in, inEnd) < 0)
+                *out++ = '?';   // encoding error
+        } while (in < scalarEnd);
+    }
+
+    utf8.truncate(out - first);
+    return utf8;
+}
+
+// Stateful UTF-16 to UTF-8 conversion of the \a len code units starting at
+// \a uc.
+//
+// With a non-null \a state:
+// - a UTF-8 BOM is written first unless IgnoreHeader is set; IgnoreHeader is
+//   always set on return;
+// - encoding errors are written as NUL instead of '?' if ConvertInvalidToNull
+//   is set, and are added to state->invalidChars;
+// - a high surrogate at the very end of the input is not encoded but stored in
+//   state->state_data[0] (with remainingChars == 1) and is picked up again at
+//   the start of the next call.
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state)
+{
+    uchar replacement = '?';
+    // worst case for the input itself: three bytes per UTF-16 code unit
+    qsizetype rlen = 3*len;
+    int surrogate_high = -1;
+    if (state) {
+        if (state->flags & QStringConverter::ConvertInvalidToNull)
+            replacement = 0;
+        if (!(state->flags & QStringConverter::IgnoreHeader))
+            rlen += 3;
+        if (state->remainingChars) {
+            surrogate_high = state->state_data[0];
+            // The carried-over high surrogate is not part of len, yet it adds
+            // one byte to the worst case: either it completes a four-byte
+            // sequence with the first input unit (instead of that unit's
+            // three), or it is replaced by a single byte without consuming
+            // any input. Without this the cursor could run one past the buffer.
+            rlen += 1;
+        }
+    }
+
+
+    QByteArray rstr(rlen, Qt::Uninitialized);
+    uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
+    const ushort *src = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = src + len;
+
+    int invalid = 0;
+    if (state && !(state->flags & QStringConverter::IgnoreHeader)) {
+        // append UTF-8 BOM
+        *cursor++ = utf8bom[0];
+        *cursor++ = utf8bom[1];
+        *cursor++ = utf8bom[2];
+    }
+
+    const ushort *nextAscii = src;
+    while (src != end) {
+        int res;
+        ushort uc;
+        if (surrogate_high != -1) {
+            // resume with the high surrogate left over from the previous call
+            uc = surrogate_high;
+            surrogate_high = -1;
+            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        } else {
+            // only retry SIMD once we are past the last known non-ASCII unit
+            if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
+                break;
+
+            uc = *src++;
+            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        }
+        if (Q_LIKELY(res >= 0))
+            continue;
+
+        if (res == QUtf8BaseTraits::Error) {
+            // encoding error
+            ++invalid;
+            *cursor++ = replacement;
+        } else if (res == QUtf8BaseTraits::EndOfString) {
+            // input ended in the middle of a surrogate pair: keep the high
+            // surrogate for the next call
+            surrogate_high = uc;
+            break;
+        }
+    }
+
+    rstr.resize(cursor - (const uchar*)rstr.constData());
+    if (state) {
+        state->invalidChars += invalid;
+        state->flags |= QStringConverter::IgnoreHeader;
+        state->remainingChars = 0;
+        if (surrogate_high >= 0) {
+            state->remainingChars = 1;
+            state->state_data[0] = surrogate_high;
+        }
+    }
+    return rstr;
+}
+
+// Stateless UTF-8 to UTF-16 conversion of the \a len bytes starting at \a chars.
+QString QUtf8::convertToUnicode(const char *chars, qsizetype len)
+{
+    // A buffer of len UTF-16 words is always enough, because no UTF-8 sequence
+    // decodes to more words than it has bytes:
+    //    UTF-8        UTF-16
+    //    1 byte       1 word
+    //    2 bytes      1 word
+    //    3 bytes      1 word
+    //    4 bytes      2 words (one surrogate pair)
+    // US-ASCII input (1-byte UTF-8) fills the buffer completely; text in
+    // U+0080-U+07FF (e.g., Greek, Cyrillic, Arabic) and non-BMP text fill half
+    // of it; text in U+0800-U+FFFF (e.g., CJK) fills one third.
+    //
+    // Invalid input fits as well: each invalid byte yields one replacement
+    // character.
+    QString decoded(len, Qt::Uninitialized);
+    QChar *const out = const_cast<QChar *>(decoded.constData()); // freshly created, so not shared
+    decoded.truncate(convertToUnicode(out, chars, len) - out);
+    return decoded;
+}
+
+/*!
+    \since 5.7
+    \overload
+
+    Decodes the \a len octets of UTF-8 that start at \a chars and writes the
+    resulting QChars to \a buffer. The caller has to provide a buffer that is
+    large enough for the result; room for \a len QChars is always sufficient.
+
+    Wherever decoding fails, a QChar::ReplacementCharacter is written
+    instead.
+
+    The return value points one past the last QChar that was written.
+
+    This function never throws.
+*/
+
+QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, qsizetype len) noexcept
+{
+    const uchar *const begin = reinterpret_cast<const uchar *>(chars);
+    const uchar *in = begin;
+    const uchar *const inEnd = begin + len;
+    ushort *out = reinterpret_cast<ushort *>(buffer);
+
+    // try to decode everything in SIMD; this succeeds for pure ASCII input
+    const uchar *scalarEnd = inEnd;
+    if (simdDecodeAscii(out, scalarEnd, in, inEnd))
+        return reinterpret_cast<QChar *>(out);
+
+    // There is at least one non-ASCII byte. If nothing was consumed yet, the
+    // input may start with the UTF-8 BOM, which is skipped.
+    const bool startsWithBom = in == begin
+            && inEnd - in >= 3
+            && in[0] == utf8bom[0] && in[1] == utf8bom[1] && in[2] == utf8bom[2];
+    if (Q_UNLIKELY(startsWithBom))
+        in += 3;
+
+    while (in < inEnd) {
+        scalarEnd = inEnd;
+        if (simdDecodeAscii(out, scalarEnd, in, inEnd))
+            break;
+
+        // decode one sequence at a time up to the point where SIMD may help again
+        do {
+            const uchar lead = *in++;
+            if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, out, in, inEnd) < 0)
+                *out++ = QChar::ReplacementCharacter;   // decoding error
+        } while (in < scalarEnd);
+    }
+
+    return reinterpret_cast<QChar *>(out);
+}
+
+// Stateful UTF-8 to UTF-16 conversion of the \a len bytes starting at \a chars.
+//
+// With a non-null \a state:
+// - bytes of an incomplete sequence left over from the previous call
+//   (state->state_data, state->remainingChars) are decoded together with the
+//   start of the new input;
+// - a leading BOM is dropped unless IgnoreHeader is set; IgnoreHeader is set
+//   once the first sequence has been decoded;
+// - decoding errors are written as QChar::Null instead of
+//   QChar::ReplacementCharacter if ConvertInvalidToNull is set, and are added
+//   to state->invalidChars;
+// - an incomplete sequence at the end of the input is saved in the state.
+// Without a state, an incomplete trailing sequence is written as replacement
+// characters.
+QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state)
+{
+    bool headerdone = false;
+    ushort replacement = QChar::ReplacementCharacter;
+    int invalid = 0;
+    int res;
+    uchar ch = 0;
+
+    // See above for buffer requirements for stateless decoding. However, that
+    // fails if the state is not empty. The following situations can add to the
+    // requirements:
+    //   state contains      chars starts with        requirement
+    //   1 of 2 bytes        valid continuation       0
+    //   2 of 3 bytes        same                     0
+    //   3 bytes of 4        same                     +1 (need to insert surrogate pair)
+    //   1 of 2 bytes        invalid continuation     +1 (need to insert replacement and restart)
+    //   2 of 3 bytes        same                     +1 (same)
+    //   3 of 4 bytes        same                     +1 (same)
+    QString result(len + 1, Qt::Uninitialized);
+
+    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+
+    if (state) {
+        if (state->flags & QStringConverter::IgnoreHeader)
+            headerdone = true;
+        if (state->flags & QStringConverter::ConvertInvalidToNull)
+            replacement = QChar::Null;
+        if (state->remainingChars) {
+            // handle incoming state first
+            uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+            qsizetype remainingCharsCount = state->remainingChars;
+            // end - src is a qsizetype; taking the minimum as int would
+            // truncate it for inputs larger than 2 GiB
+            qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
+
+            // rebuild the pending sequence: saved bytes followed by new input
+            memset(remainingCharsData, 0, sizeof(remainingCharsData));
+            memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+            memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+            const uchar *begin = &remainingCharsData[1];
+            res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+                    static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+            if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
+                // special case for len == 0:
+                // if we were supplied an empty string, terminate the previous, unfinished sequence with error
+                ++invalid;
+                *dst++ = replacement;
+            } else if (res == QUtf8BaseTraits::EndOfString) {
+                // if we got EndOfString again, then there were too few bytes in src;
+                // copy to our state and return
+                state->remainingChars = remainingCharsCount + newCharsToCopy;
+                memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+                return QString();
+            } else if (!headerdone && res >= 0) {
+                // eat the UTF-8 BOM
+                headerdone = true;
+                if (dst[-1] == 0xfeff)
+                    --dst;
+            }
+
+            // adjust src now that we have maybe consumed a few chars
+            if (res >= 0) {
+                // res counts the whole sequence, including the bytes that came from the state
+                Q_ASSERT(res > remainingCharsCount);
+                src += res - remainingCharsCount;
+            }
+        }
+    }
+
+    // main body, stateless decoding
+    res = 0;
+    const uchar *nextAscii = src;
+    const uchar *start = src;
+    while (res >= 0 && src < end) {
+        // only retry SIMD once we are past the last known non-ASCII byte
+        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
+            break;
+
+        ch = *src++;
+        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+        if (!headerdone && res >= 0) {
+            headerdone = true;
+            if (src == start + 3) { // 3 == sizeof(utf8-bom)
+                // eat the UTF-8 BOM (it can only appear at the beginning of the string).
+                if (dst[-1] == 0xfeff)
+                    --dst;
+            }
+        }
+        if (res == QUtf8BaseTraits::Error) {
+            // reset res so that the loop carries on after the replacement
+            res = 0;
+            ++invalid;
+            *dst++ = replacement;
+        }
+    }
+
+    if (!state && res == QUtf8BaseTraits::EndOfString) {
+        // unterminated UTF sequence
+        *dst++ = QChar::ReplacementCharacter;
+        while (src++ < end)
+            *dst++ = QChar::ReplacementCharacter;
+    }
+
+    result.truncate(dst - (const ushort *)result.unicode());
+    if (state) {
+        state->invalidChars += invalid;
+        if (headerdone)
+            state->flags |= QStringConverter::IgnoreHeader;
+        if (res == QUtf8BaseTraits::EndOfString) {
+            // save the incomplete trailing sequence for the next call
+            --src; // unread the byte in ch
+            state->remainingChars = end - src;
+            memcpy(&state->state_data[0], src, end - src);
+        } else {
+            state->remainingChars = 0;
+        }
+    }
+    return result;
+}
+
+// Decoder traits for validation only: both append hooks discard what they are
+// given, so decoding with these traits produces no output.
+struct QUtf8NoOutputTraits : QUtf8BaseTraitsNoAscii
+{
+    // empty stand-in for the output pointer
+    struct NoOutput
+    {
+    };
+
+    static void appendUtf16(const NoOutput &, ushort)
+    {
+    }
+
+    static void appendUcs4(const NoOutput &, uint)
+    {
+    }
+};
+
+// Checks the \a len bytes starting at \a chars. The first member of the result
+// tells whether they are valid UTF-8, the second whether they are in addition
+// pure ASCII. Both are false as soon as one sequence fails to decode.
+QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
+{
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *const end = src + len;
+    const uchar *nextAscii = src;
+    bool onlyAscii = true;
+
+    while (src < end) {
+        // skip ahead over ASCII once we are past the last known non-ASCII byte
+        if (src >= nextAscii)
+            src = simdFindNonAscii(src, end, nextAscii);
+        if (src == end)
+            break;
+
+        do {
+            const uchar lead = *src++;
+            if (lead & 0x80) {
+                onlyAscii = false;
+                QUtf8NoOutputTraits::NoOutput sink;
+                if (QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(lead, sink, src, end) < 0)
+                    return { false, false };    // decoding error
+            }
+        } while (src < nextAscii);
+    }
+
+    return { true, onlyAscii };
+}
+
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, qsizetype u16len)
+{
+    // Compares UTF-8 data with UTF-16 data one code point at a time.
+    // The result is negative, zero or positive; undecodable UTF-8 input
+    // compares as U+FFFD.
+    const uchar *bytes = reinterpret_cast<const uchar *>(utf8);
+    const uchar *bytesEnd = bytes + u8len;
+    QStringIterator units(utf16, utf16 + u16len);
+
+    while (bytes < bytesEnd && units.hasNext()) {
+        const uchar lead = *bytes++;
+        uint lhs;
+        uint *sink = &lhs; // the decoder writes the code point through this pointer
+        if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, sink, bytes, bytesEnd) < 0)
+            lhs = QChar::ReplacementCharacter; // decoding error
+
+        const uint rhs = units.next();
+        if (lhs != rhs)
+            return int(lhs) - int(rhs);
+    }
+
+    // the shorter string sorts first
+    return (bytesEnd > bytes) - int(units.hasNext());
+}
+
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
+{
+    // Compares UTF-8 data with a Latin-1 string one code point at a time.
+    // The result is negative, zero or positive; undecodable UTF-8 input
+    // compares as U+FFFD.
+    const uchar *bytes = reinterpret_cast<const uchar *>(utf8);
+    const uchar *bytesEnd = bytes + u8len;
+    const uchar *latin = reinterpret_cast<const uchar *>(s.latin1());
+    const uchar *latinEnd = latin + s.size();
+
+    while (bytes < bytesEnd && latin < latinEnd) {
+        const uchar lead = *bytes++;
+        uint lhs;
+        uint *sink = &lhs; // the decoder writes the code point through this pointer
+        if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, sink, bytes, bytesEnd) < 0)
+            lhs = QChar::ReplacementCharacter; // decoding error
+
+        const uint rhs = *latin++;
+        if (lhs != rhs)
+            return int(lhs) - int(rhs);
+    }
+
+    // the shorter string sorts first
+    return (bytesEnd > bytes) - (latinEnd > latin);
+}
+
+QByteArray QUtf16::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{
+    // Serializes len UTF-16 code units into bytes of the requested byte
+    // order. A BOM is written first unless the state carries the
+    // IgnoreHeader flag; afterwards the flag is set on the state so that
+    // follow-up calls do not emit another BOM.
+    const bool writeBom = !state || !(state->flags & QStringConverter::IgnoreHeader);
+
+    DataEndianness endian = e;
+    if (e == DetectEndianness) // nothing requested: use the host byte order
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+    const bool bigEndian = (endian == BigEndianness);
+
+    qsizetype length = 2 * len;
+    if (writeBom)
+        length += 2;
+
+    QByteArray d;
+    d.resize(length);
+    char *out = d.data();
+    if (writeBom) {
+        QChar bom(QChar::ByteOrderMark);
+        if (bigEndian)
+            qToBigEndian(bom.unicode(), out);
+        else
+            qToLittleEndian(bom.unicode(), out);
+        out += 2;
+    }
+
+    if (bigEndian)
+        qToBigEndian<ushort>(uc, len, out);
+    else
+        qToLittleEndian<ushort>(uc, len, out);
+
+    if (state) {
+        state->remainingChars = 0;
+        state->flags |= QStringConverter::IgnoreHeader;
+    }
+    return d;
+}
+
+QString QUtf16::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{ // Decodes len bytes of UTF-16 into a QString; the optional state carries header/byte-order info and a dangling odd byte across calls.
+ DataEndianness endian = e;
+ bool half = false; // true while buf holds the first byte of a not yet completed 16-bit unit
+ uchar buf = 0;
+ bool headerdone = false; // true once the first code unit (a possible BOM) has been dealt with
+ if (state) {
+ headerdone = state->flags & QStringConverter::IgnoreHeader;
+ if (endian == DetectEndianness)
+ endian = (DataEndianness)state->state_data[Endian]; // reuse the byte order saved by a previous call
+ if (state->remainingChars) {
+ half = true; // a previous call ended in the middle of a unit
+ buf = state->state_data[Data];
+ }
+ }
+ if (headerdone && endian == DetectEndianness)
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; // no BOM will be examined: fall back to host order
+
+ QString result(len, Qt::Uninitialized); // worst case
+ QChar *qch = (QChar *)result.data();
+ while (len--) {
+ if (half) {
+ QChar ch;
+ if (endian == LittleEndianness) {
+ ch.setRow(*chars++);
+ ch.setCell(buf);
+ } else { // BigEndianness, and also DetectEndianness: the unit is provisionally assembled as big-endian
+ ch.setRow(buf);
+ ch.setCell(*chars++);
+ }
+ if (!headerdone) {
+ headerdone = true;
+ if (endian == DetectEndianness) {
+ if (ch == QChar::ByteOrderSwapped) { // BOM read with swapped bytes: data is little-endian; the BOM is not emitted
+ endian = LittleEndianness;
+ } else if (ch == QChar::ByteOrderMark) { // BOM read as-is: data is big-endian; the BOM is not emitted
+ endian = BigEndianness;
+ } else { // no BOM: assume host byte order and keep the character
+ if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+ endian = BigEndianness;
+ } else {
+ endian = LittleEndianness;
+ ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8)); // undo the provisional big-endian assembly
+ }
+ *qch++ = ch;
+ }
+ } else if (ch != QChar::ByteOrderMark) { // explicit byte order: a leading BOM is skipped, anything else is kept
+ *qch++ = ch;
+ }
+ } else {
+ *qch++ = ch;
+ }
+ half = false;
+ } else {
+ buf = *chars++; // first byte of the next unit
+ half = true;
+ }
+ }
+ result.truncate(qch - result.unicode()); // shrink the worst-case allocation to what was written
+
+ if (state) { // without a state, a trailing odd byte is dropped
+ if (headerdone)
+ state->flags |= QStringConverter::IgnoreHeader;
+ state->state_data[Endian] = endian;
+ if (half) {
+ state->remainingChars = 1;
+ state->state_data[Data] = buf;
+ } else {
+ state->remainingChars = 0;
+ state->state_data[Data] = 0;
+ }
+ }
+ return result;
+}
+
+QByteArray QUtf32::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{
+    // Encodes the UTF-16 input as UTF-32: one 4-byte value per code point,
+    // in the requested byte order (DetectEndianness selects the host
+    // order). A BOM is written first unless the state carries the
+    // IgnoreHeader flag; afterwards the flag is set on the state so that
+    // follow-up calls do not emit another BOM.
+    const bool writeBom = !state || !(state->flags & QStringConverter::IgnoreHeader);
+
+    DataEndianness endian = e;
+    if (e == DetectEndianness)
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+    // worst case: every UTF-16 code unit is a code point of its own
+    qsizetype length = 4 * len;
+    if (writeBom)
+        length += 4;
+
+    QByteArray d(length, Qt::Uninitialized);
+    char *data = d.data();
+    if (writeBom) {
+        if (endian == BigEndianness) {
+            data[0] = 0;
+            data[1] = 0;
+            data[2] = (char)0xfe;
+            data[3] = (char)0xff;
+        } else {
+            data[0] = (char)0xff;
+            data[1] = (char)0xfe;
+            data[2] = 0;
+            data[3] = 0;
+        }
+        data += 4;
+    }
+
+    QStringIterator i(uc, uc + len);
+    if (endian == BigEndianness) {
+        while (i.hasNext()) {
+            uint cp = i.next();
+            qToBigEndian(cp, data);
+            data += 4;
+        }
+    } else {
+        while (i.hasNext()) {
+            uint cp = i.next();
+            qToLittleEndian(cp, data);
+            data += 4;
+        }
+    }
+
+    // A surrogate pair takes two input units but produces a single code
+    // point, so fewer than `length` bytes may have been written. Cut the
+    // array down to what was actually produced instead of handing the
+    // uninitialized tail to the caller.
+    d.truncate(data - d.constData());
+
+    if (state) {
+        state->remainingChars = 0;
+        state->flags |= QStringConverter::IgnoreHeader;
+    }
+    return d;
+}
+
+QString QUtf32::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state, DataEndianness e)
+{ // Decodes len bytes of UTF-32 into a QString; the optional state carries header/byte-order info and an incomplete 4-byte group across calls.
+ DataEndianness endian = e;
+ uchar tuple[4]; // bytes of the code point currently being collected
+ int num = 0; // how many bytes of tuple are filled
+ bool headerdone = false; // true once the first complete group (a possible BOM) has been dealt with
+ if (state) {
+ headerdone = state->flags & QStringConverter::IgnoreHeader;
+ if (endian == DetectEndianness) {
+ endian = (DataEndianness)state->state_data[Endian]; // reuse the byte order saved by a previous call
+ }
+ num = state->remainingChars; // resume a group left unfinished by a previous call
+ memcpy(tuple, &state->state_data[Data], 4);
+ }
+ if (headerdone && endian == DetectEndianness)
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness; // no BOM will be examined: fall back to host order
+
+ QString result;
+ result.resize((num + len) >> 2 << 1); // worst case
+ QChar *qch = (QChar *)result.data();
+
+ const char *end = chars + len;
+ while (chars < end) {
+ tuple[num++] = *chars++;
+ if (num == 4) {
+ if (!headerdone) {
+ headerdone = true;
+ if (endian == DetectEndianness) { // note: the "endian != ..." tests in the two branches below are always true inside this block
+ if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
+ endian = LittleEndianness; // little-endian BOM: consumed, not emitted
+ num = 0;
+ continue;
+ } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
+ endian = BigEndianness; // big-endian BOM: consumed, not emitted
+ num = 0;
+ continue;
+ } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { // no BOM: assume host byte order and decode this group normally
+ endian = BigEndianness;
+ } else {
+ endian = LittleEndianness;
+ }
+ } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) { // explicit byte order: a leading BOM is skipped
+ num = 0;
+ continue;
+ }
+ }
+ uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
+ for (char16_t c : QChar::fromUcs4(code)) // append the UTF-16 unit(s) produced for this code point
+ *qch++ = c;
+ num = 0;
+ }
+ }
+ result.truncate(qch - result.unicode()); // shrink the worst-case allocation to what was written
+
+ if (state) { // without a state, trailing bytes of an incomplete group are dropped
+ if (headerdone)
+ state->flags |= QStringConverter::IgnoreHeader;
+ state->state_data[Endian] = endian;
+ state->remainingChars = num;
+ memcpy(&state->state_data[Data], tuple, 4);
+ }
+ return result;
+}
+
+QString qFromUtfEncoded(const QByteArray &ba)
+{
+    // Chooses the decoder from a byte order mark at the start of ba.
+    // UTF-32 is tested before UTF-16 because the little-endian UTF-32 BOM
+    // (FF FE 00 00) begins with the little-endian UTF-16 BOM (FF FE).
+    // Data without a recognized BOM is decoded as UTF-8.
+    const qsizetype arraySize = ba.size();
+    const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
+    const uint bom = 0xfeff;
+
+    if (arraySize > 3) {
+        uint uc = qFromUnaligned<uint>(buf);
+        if (uc == qToBigEndian(bom) || uc == qToLittleEndian(bom))
+            return QUtf32::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-32
+    }
+
+    if (arraySize > 1) {
+        ushort uc = qFromUnaligned<ushort>(buf);
+        // both alternatives must compare against uc; a bare
+        // qToLittleEndian(...) is a non-zero constant and would send every
+        // input of two or more bytes down the UTF-16 path
+        if (uc == qToBigEndian(ushort(bom)) || uc == qToLittleEndian(ushort(bom)))
+            return QUtf16::convertToUnicode(ba.constData(), ba.length(), nullptr); // utf-16
+    }
+    return QUtf8::convertToUnicode(ba.constData(), ba.length());
+}
+
/*!
\enum QStringConverter::Flag
@@ -60,7 +1004,8 @@ void QStringConverter::State::clear()
{
if (clearFn)
clearFn(this);
- state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
+ else
+ state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
remainingChars = 0;
invalidChars = 0;
}
diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h
new file mode 100644
index 0000000000..5764979542
--- /dev/null
+++ b/src/corelib/text/qstringconverter_p.h
@@ -0,0 +1,323 @@
+/****************************************************************************
+**
+** Copyright (C) 2018 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtCore module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QSTRINGCONVERTER_P_H
+#define QSTRINGCONVERTER_P_H
+
+//
+// W A R N I N G
+// -------------
+//
+// This file is not part of the Qt API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <QtCore/qstring.h>
+#include <QtCore/qendian.h>
+#include <QtCore/qstringconverter.h>
+
+QT_BEGIN_NAMESPACE
+
+struct QUtf8BaseTraits // default policy and I/O primitives used by QUtf8Functions::toUtf8() / fromUtf8()
+{
+ static const bool isTrusted = false; // false: fromUtf8() validates lead bytes and the decoded value
+ static const bool allowNonCharacters = true; // true: Unicode non-characters are not rejected
+ static const bool skipAsciiHandling = false; // false: the codec functions handle values below 0x80 themselves
+ static const int Error = -1; // returned for malformed input
+ static const int EndOfString = -2; // returned when the input ends in the middle of a sequence
+
+ static bool isValidCharacter(uint u) // true unless u turns negative when converted to int
+ { return int(u) >= 0; }
+
+ static void appendByte(uchar *&ptr, uchar b) // writes one byte and advances the output pointer
+ { *ptr++ = b; }
+
+ static uchar peekByte(const uchar *ptr, int n = 0) // reads the byte at offset n without advancing
+ { return ptr[n]; }
+
+ static qptrdiff availableBytes(const uchar *ptr, const uchar *end) // number of bytes left before end
+ { return end - ptr; }
+
+ static void advanceByte(const uchar *&ptr, int n = 1) // moves the input pointer forward by n bytes
+ { ptr += n; }
+
+ static void appendUtf16(ushort *&ptr, ushort uc) // writes one UTF-16 unit and advances the output pointer
+ { *ptr++ = uc; }
+
+ static void appendUcs4(ushort *&ptr, uint uc) // writes uc as a high/low surrogate pair
+ {
+ appendUtf16(ptr, QChar::highSurrogate(uc));
+ appendUtf16(ptr, QChar::lowSurrogate(uc));
+ }
+
+ static ushort peekUtf16(const ushort *ptr, int n = 0) // reads the UTF-16 unit at offset n without advancing
+ { return ptr[n]; }
+
+ static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) // number of UTF-16 units left before end
+ { return end - ptr; }
+
+ static void advanceUtf16(const ushort *&ptr, int n = 1) // moves the input pointer forward by n units
+ { ptr += n; }
+
+ // it's possible to output to UCS-4 too
+ static void appendUtf16(uint *&ptr, ushort uc) // stores the unit as one 32-bit value
+ { *ptr++ = uc; }
+
+ static void appendUcs4(uint *&ptr, uint uc) // stores the code point as one 32-bit value, no surrogates
+ { *ptr++ = uc; }
+};
+
+struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits // same as the base traits, but with the ASCII fast path of toUtf8()/fromUtf8() switched off
+{
+ static const bool skipAsciiHandling = true; // hides the base value; the codec functions then skip their "< 0x80" branch
+};
+
+namespace QUtf8Functions
+{
+ /// returns 0 on success; errors can only happen if \a u is a surrogate:
+ /// Error if \a u is a low surrogate;
+ /// if \a u is a high surrogate, Error if the next isn't a low one,
+ /// EndOfString if we run into the end of the string.
+ template <typename Traits, typename OutputPtr, typename InputPtr> inline
+ int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) // u is the unit already read by the caller; src points at the unit after it
+ {
+ if (!Traits::skipAsciiHandling && u < 0x80) {
+ // U+0000 to U+007F (US-ASCII) - one byte
+ Traits::appendByte(dst, uchar(u));
+ return 0;
+ } else if (u < 0x0800) {
+ // U+0080 to U+07FF - two bytes
+ // first of two bytes
+ Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
+ } else {
+ if (!QChar::isSurrogate(u)) {
+ // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
+ if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
+ return Traits::Error;
+
+ // first of three bytes
+ Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
+ } else {
+ // U+10000 to U+10FFFF - four bytes
+ // need to get one extra codepoint
+ if (Traits::availableUtf16(src, end) == 0) // NOTE(review): tested before the surrogate kind, so a low surrogate at the very end yields EndOfString, not Error
+ return Traits::EndOfString;
+
+ ushort low = Traits::peekUtf16(src);
+ if (!QChar::isHighSurrogate(u))
+ return Traits::Error;
+ if (!QChar::isLowSurrogate(low))
+ return Traits::Error;
+
+ Traits::advanceUtf16(src); // the low surrogate is consumed only once the pair is known to be valid
+ uint ucs4 = QChar::surrogateToUcs4(u, low);
+
+ if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
+ return Traits::Error;
+
+ // first byte
+ Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
+
+ // second of four bytes
+ Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
+
+ // for the rest of the bytes
+ u = ushort(ucs4); // only the low 12 bits are used by the two shared trailing bytes below
+ }
+
+ // second to last byte
+ Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
+ }
+
+ // last byte
+ Traits::appendByte(dst, 0x80 | (u & 0x3f));
+ return 0;
+ }
+
+    /// returns true if \a b has the bit pattern 10xxxxxx
+    inline bool isContinuationByte(uchar b)
+    {
+        const uchar topTwoBits = b & 0xc0;
+        return topTwoBits == 0x80;
+    }
+
+    /// returns the number of characters consumed (including \a b) in case of success;
+    /// returns negative in case of error: Traits::Error or Traits::EndOfString
+    ///
+    /// \a b is the lead byte, already read by the caller; \a src points at the
+    /// byte following it and is advanced past the continuation bytes only on
+    /// success. Decoded output is appended to \a dst through
+    /// Traits::appendUtf16() or Traits::appendUcs4().
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        int charsNeeded;
+        uint min_uc;
+        uint uc;
+
+        if (!Traits::skipAsciiHandling && b < 0x80) {
+            // US-ASCII
+            Traits::appendUtf16(dst, b);
+            return 1;
+        }
+
+        if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
+            // an UTF-8 first character must be at least 0xC0
+            // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
+            return Traits::Error;
+        } else if (b < 0xe0) {
+            charsNeeded = 2;
+            min_uc = 0x80;
+            uc = b & 0x1f;
+        } else if (b < 0xf0) {
+            charsNeeded = 3;
+            min_uc = 0x800;
+            uc = b & 0x0f;
+        } else if (b < 0xf5) {
+            charsNeeded = 4;
+            min_uc = 0x10000;
+            uc = b & 0x07;
+        } else {
+            // the last Unicode character is U+10FFFF
+            // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
+            // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
+            return Traits::Error;
+        }
+
+        // keep the full pointer-difference width: narrowing the count to int
+        // would misjudge the remaining length of inputs larger than INT_MAX
+        qptrdiff bytesAvailable = Traits::availableBytes(src, end);
+        if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
+            // it's possible that we have an error instead of just unfinished bytes
+            if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
+                return Traits::Error;
+            if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
+                return Traits::Error;
+            return Traits::EndOfString;
+        }
+
+        // first continuation character
+        b = Traits::peekByte(src, 0);
+        if (!isContinuationByte(b))
+            return Traits::Error;
+        uc <<= 6;
+        uc |= b & 0x3f;
+
+        if (charsNeeded > 2) {
+            // second continuation character
+            b = Traits::peekByte(src, 1);
+            if (!isContinuationByte(b))
+                return Traits::Error;
+            uc <<= 6;
+            uc |= b & 0x3f;
+
+            if (charsNeeded > 3) {
+                // third continuation character
+                b = Traits::peekByte(src, 2);
+                if (!isContinuationByte(b))
+                    return Traits::Error;
+                uc <<= 6;
+                uc |= b & 0x3f;
+            }
+        }
+
+        // we've decoded something; safety-check it
+        if (!Traits::isTrusted) {
+            if (uc < min_uc) // overlong encoding
+                return Traits::Error;
+            if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
+                return Traits::Error;
+            if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
+                return Traits::Error;
+        }
+
+        // write the UTF-16 sequence
+        if (!QChar::requiresSurrogates(uc)) {
+            // UTF-8 decoded and no surrogates are required
+            // detach if necessary
+            Traits::appendUtf16(dst, ushort(uc));
+        } else {
+            // UTF-8 decoded to something that requires a surrogate pair
+            Traits::appendUcs4(dst, uc);
+        }
+
+        // the continuation bytes were only peeked at so far; consume them now
+        Traits::advanceByte(src, charsNeeded - 1);
+        return charsNeeded;
+    }
+}
+
+enum DataEndianness // byte order selector for the QUtf16 / QUtf32 converters; decoders also store it in the conversion state
+{
+ DetectEndianness, // no fixed order: decoders inspect a leading BOM, encoders use the host byte order
+ BigEndianness, // most significant byte first
+ LittleEndianness // least significant byte first
+};
+
+struct QUtf8 // static entry points for UTF-8 conversion, validation and comparison
+{
+ static QChar *convertToUnicode(QChar *, const char *, qsizetype) noexcept; // writes into a caller-supplied QChar buffer
+ static QString convertToUnicode(const char *, qsizetype); // stateless decode
+ static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *); // decode that can carry state between calls
+ static QByteArray convertFromUnicode(const QChar *, qsizetype); // stateless encode
+ static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *); // encode that can carry state between calls
+ struct ValidUtf8Result { // result of isValidUtf8()
+ bool isValidUtf8;
+ bool isValidAscii;
+ };
+ static ValidUtf8Result isValidUtf8(const char *, qsizetype);
+ static int compareUtf8(const char *, qsizetype, const QChar *, qsizetype); // UTF-8 against UTF-16
+ static int compareUtf8(const char *, qsizetype, QLatin1String s); // UTF-8 against Latin-1
+};
+
+struct QUtf16 // static entry points for UTF-16 byte-stream conversion
+{
+ static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); // bytes -> QString; byte order defaults to detection
+ static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); // UTF-16 units -> bytes
+};
+
+struct QUtf32 // static entry points for UTF-32 byte-stream conversion
+{
+ static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); // bytes -> QString; byte order defaults to detection
+ static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *, DataEndianness = DetectEndianness); // UTF-16 units -> bytes
+};
+
+/*
+ Converts from different utf encodings looking at a possible byte order mark at the
+ beginning of the string. If no BOM exists, utf-8 is assumed.
+ */
+Q_CORE_EXPORT QString qFromUtfEncoded(const QByteArray &ba);
+
+QT_END_NAMESPACE
+
+#endif // QSTRINGCONVERTER_P_H
diff --git a/src/corelib/text/text.pri b/src/corelib/text/text.pri
index 4c584cf958..1275c014a8 100644
--- a/src/corelib/text/text.pri
+++ b/src/corelib/text/text.pri
@@ -20,6 +20,7 @@ HEADERS += \
text/qstringalgorithms_p.h \
text/qstringbuilder.h \
text/qstringconverter.h \
+ text/qstringconverter_p.h \
text/qstringiterator_p.h \
text/qstringlist.h \
text/qstringliteral.h \
diff --git a/src/gui/kernel/qclipboard.cpp b/src/gui/kernel/qclipboard.cpp
index 72f27d3e49..3b42e78624 100644
--- a/src/gui/kernel/qclipboard.cpp
+++ b/src/gui/kernel/qclipboard.cpp
@@ -46,9 +46,7 @@
#include "qvariant.h"
#include "qbuffer.h"
#include "qimage.h"
-#if QT_CONFIG(textcodec)
-#include "private/qutfcodec_p.h"
-#endif
+#include "private/qstringconverter_p.h"
#include "private/qguiapplication_p.h"
#include <qpa/qplatformintegration.h>
diff --git a/src/tools/bootstrap/.prev_CMakeLists.txt b/src/tools/bootstrap/.prev_CMakeLists.txt
index 8f430c494e..f81e03adff 100644
--- a/src/tools/bootstrap/.prev_CMakeLists.txt
+++ b/src/tools/bootstrap/.prev_CMakeLists.txt
@@ -41,9 +41,6 @@ qt_add_module(Bootstrap
../../3rdparty/pcre2/src/pcre2_ucp.h
../../3rdparty/pcre2/src/pcre2_valid_utf.c
../../3rdparty/pcre2/src/pcre2_xclass.c
- ../../corelib/codecs/qlatincodec.cpp
- ../../corelib/codecs/qtextcodec.cpp
- ../../corelib/codecs/qutfcodec.cpp
../../corelib/global/qendian.cpp
../../corelib/global/qglobal.cpp
../../corelib/global/qlogging.cpp
@@ -109,7 +106,6 @@ qt_add_module(Bootstrap
../../corelib/text/qstringbuilder.cpp
../../corelib/text/qstringconverter.cpp
../../corelib/text/qstringlist.cpp
- ../../corelib/text/qstringview.cpp
../../corelib/text/qvsnprintf.cpp
../../corelib/time/qcalendar.cpp
../../corelib/time/qdatetime.cpp
diff --git a/src/tools/bootstrap/CMakeLists.txt b/src/tools/bootstrap/CMakeLists.txt
index 5a17888003..a5184fbb80 100644
--- a/src/tools/bootstrap/CMakeLists.txt
+++ b/src/tools/bootstrap/CMakeLists.txt
@@ -42,9 +42,6 @@ qt_extend_target(Bootstrap
../../3rdparty/pcre2/src/pcre2_ucp.h
../../3rdparty/pcre2/src/pcre2_valid_utf.c
../../3rdparty/pcre2/src/pcre2_xclass.c
- ../../corelib/codecs/qlatincodec.cpp
- ../../corelib/codecs/qtextcodec.cpp
- ../../corelib/codecs/qutfcodec.cpp
../../corelib/global/qendian.cpp
../../corelib/global/qglobal.cpp
../../corelib/global/qlogging.cpp
@@ -110,7 +107,6 @@ qt_extend_target(Bootstrap
../../corelib/text/qstringbuilder.cpp
../../corelib/text/qstringconverter.cpp
../../corelib/text/qstringlist.cpp
- ../../corelib/text/qstringview.cpp
../../corelib/text/qvsnprintf.cpp
../../corelib/time/qcalendar.cpp
../../corelib/time/qdatetime.cpp
diff --git a/src/tools/bootstrap/bootstrap.pro b/src/tools/bootstrap/bootstrap.pro
index 169c5fe1c2..5b7da8e687 100644
--- a/src/tools/bootstrap/bootstrap.pro
+++ b/src/tools/bootstrap/bootstrap.pro
@@ -28,9 +28,6 @@ INCLUDEPATH += \
$$PWD/../../3rdparty/pcre2/src
SOURCES += \
- ../../corelib/codecs/qlatincodec.cpp \
- ../../corelib/codecs/qtextcodec.cpp \
- ../../corelib/codecs/qutfcodec.cpp \
../../corelib/global/qendian.cpp \
../../corelib/global/qglobal.cpp \
../../corelib/global/qlogging.cpp \
@@ -96,7 +93,6 @@ SOURCES += \
../../corelib/text/qstringconverter.cpp \
../../corelib/text/qstring_compat.cpp \
../../corelib/text/qstringlist.cpp \
- ../../corelib/text/qstringview.cpp \
../../corelib/text/qvsnprintf.cpp \
../../corelib/time/qcalendar.cpp \
../../corelib/time/qdatetime.cpp \