summary | refs | log | tree | commit | diff | stats
path: root/src/corelib/text/qstringconverter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r-- src/corelib/text/qstringconverter.cpp 1265
1 files changed, 933 insertions, 332 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index df9efe7f67..efa625e30b 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -1,42 +1,6 @@
-/****************************************************************************
-**
-** Copyright (C) 2020 The Qt Company Ltd.
-** Copyright (C) 2020 Intel Corporation.
-** Contact: https://www.qt.io/licensing/
-**
-** This file is part of the QtCore module of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:LGPL$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU Lesser General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 3 as published by the Free Software
-** Foundation and appearing in the file LICENSE.LGPL3 included in the
-** packaging of this file. Please review the following information to
-** ensure the GNU Lesser General Public License version 3 requirements
-** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 2.0 or (at your option) the GNU General
-** Public license version 3 or any later version approved by the KDE Free
-** Qt Foundation. The licenses are as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-2.0.html and
-** https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2020 The Qt Company Ltd.
+// Copyright (C) 2020 Intel Corporation.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include <qstringconverter.h>
#include <private/qstringconverter_p.h>
@@ -44,24 +8,47 @@
#include "private/qsimd_p.h"
#include "private/qstringiterator_p.h"
+#include "private/qtools_p.h"
#include "qbytearraymatcher.h"
+#include "qcontainertools_impl.h"
+#include <QtCore/qbytearraylist.h>
+
+#if QT_CONFIG(icu)
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/ucnv_err.h>
+#include <unicode/ustring.h>
+#endif
#ifdef Q_OS_WIN
#include <qt_windows.h>
+#ifndef QT_BOOTSTRAPPED
+#include <QtCore/qvarlengtharray.h>
+#include <QtCore/q20iterator.h>
+#include <QtCore/private/qnumeric_p.h>
+#endif // !QT_BOOTSTRAPPED
#endif
+#include <array>
+
#if __has_include(<bit>) && __cplusplus > 201703L
#include <bit>
#endif
QT_BEGIN_NAMESPACE
+using namespace QtMiscUtils;
+
+static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
+static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
+static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
+static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
+
enum { Endian = 0, Data = 1 };
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
-#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
- || defined(__ARM_NEON__)
+#if defined(__SSE2__) || defined(__ARM_NEON__)
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
{
#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
@@ -77,8 +64,8 @@ static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
}
#endif
-#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
-static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+#if defined(__SSE2__)
+static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
{
// do sixteen characters at a time
for ( ; end - src >= 16; src += 16, dst += 16) {
@@ -142,7 +129,7 @@ static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const
return src == end;
}
-static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
// do sixteen characters at a time
for ( ; end - src >= 16; src += 16, dst += 16) {
@@ -214,14 +201,14 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
#ifdef __AVX2__
// do 32 characters at a time
// (this is similar to simdTestMask in qstring.cpp)
- const __m256i mask = _mm256_set1_epi8(0x80);
+ const __m256i mask = _mm256_set1_epi8(char(0x80));
for ( ; end - src >= 32; src += 32) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
if (_mm256_testz_si256(mask, data))
continue;
uint n = _mm256_movemask_epi8(data);
- Q_ASSUME(n);
+ Q_ASSERT(n);
// find the next probable ASCII character
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
@@ -271,7 +258,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
// and advance src8 and src16 to the first character that could not be compared
-static void simdCompareAscii(const char8_t *&src8, const char8_t *end8, const char16_t *&src16, const char16_t *end16)
+static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
{
int bitSpacing = 1;
qptrdiff len = qMin(end8 - src8, end16 - src16);
@@ -361,7 +348,7 @@ static void simdCompareAscii(const char8_t *&src8, const char8_t *end8, const ch
src16 += offset;
}
#elif defined(__ARM_NEON__)
-static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
{
uint16x8_t maxAscii = vdupq_n_u16(0x7f);
uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
@@ -370,7 +357,7 @@ static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const
// do sixteen characters at a time
for ( ; end - src >= 16; src += 16, dst += 16) {
// load 2 lanes (or: "load interleaved")
- uint16x8x2_t in = vld2q_u16(src);
+ uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
// check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
// add those together into a scalar, and merge the scalars.
@@ -398,7 +385,7 @@ static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const
return src == end;
}
-static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
// do eight characters at a time
uint8x8_t msb_mask = vdup_n_u8(0x80);
@@ -408,7 +395,7 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
if (!n) {
// store
- vst1q_u16(dst, vmovl_u8(c));
+ vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
continue;
}
@@ -457,16 +444,16 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
return src;
}
-static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
+static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *) // ARM NEON branch: placeholder with the same signature as the SSE2 version
{ // intentionally empty: neither input pointer is advanced here
}
#else
-static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
+static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *) // fallback used when neither the SSE2 nor the NEON version is compiled
{
return false; // nothing is encoded here; false makes the caller continue with its own per-character loop
}
-static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
+static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *) // fallback used when neither the SSE2 nor the NEON version is compiled
{
return false; // nothing is decoded here; the pointers are taken by value, so the caller's cursors are untouched
}
@@ -477,7 +464,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
return src;
}
-static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
+static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *) // generic fallback (no SSE2, no NEON): placeholder with the same signature
{ // intentionally empty: neither input pointer is advanced here
}
#endif
@@ -491,16 +478,16 @@ QByteArray QUtf8::convertFromUnicode(QStringView in)
// create a QByteArray with the worst case scenario size
QByteArray result(len * 3, Qt::Uninitialized);
uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
- const ushort *src = reinterpret_cast<const ushort *>(in.data());
- const ushort *const end = src + len;
+ const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
+ const char16_t *const end = src + len;
while (src != end) {
- const ushort *nextAscii = end;
+ const char16_t *nextAscii = end;
if (simdEncodeAscii(dst, nextAscii, src, end))
break;
do {
- ushort u = *src++;
+ char16_t u = *src++;
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
if (res < 0) {
// encoding error - append '?'
@@ -524,8 +511,7 @@ QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State
char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
{
Q_ASSERT(state);
- const QChar *uc = in.data();
- qsizetype len = in.length();
+ qsizetype len = in.size();
if (!len)
return out;
@@ -542,8 +528,8 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
};
uchar *cursor = reinterpret_cast<uchar *>(out);
- const ushort *src = reinterpret_cast<const ushort *>(uc);
- const ushort *const end = src + len;
+ const char16_t *src = in.utf16();
+ const char16_t *const end = src + len;
if (!(state->flags & QStringDecoder::Flag::Stateless)) {
if (state->remainingChars) {
@@ -562,12 +548,12 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
}
while (src != end) {
- const ushort *nextAscii = end;
+ const char16_t *nextAscii = end;
if (simdEncodeAscii(cursor, nextAscii, src, end))
break;
do {
- ushort uc = *src++;
+ char16_t uc = *src++;
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
if (Q_LIKELY(res >= 0))
continue;
@@ -592,6 +578,21 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
return reinterpret_cast<char *>(cursor);
}
+char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in) // encodes each character of \a in as UTF-8 starting at \a out; returns the position after the last byte written
+{
+ // ### SIMD-optimize:
+ for (uchar ch : in) {
+ if (ch < 128) { // values below 128 are copied through as a single byte
+ *out++ = ch;
+ } else {
+ // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
+ *out++ = 0b110'0'0000u | (ch >> 6); // lead byte: 110 prefix plus the top two bits of ch
+ *out++ = 0b10'00'0000u | (ch & 0b0011'1111); // continuation byte: 10 prefix plus the low six bits of ch
+ }
+ }
+ return out; // note: no bounds check on \a out is done here; each input character writes one or two bytes
+}
+
QString QUtf8::convertToUnicode(QByteArrayView in)
{
// UTF-8 to UTF-16 always needs the exact same number of words or less:
@@ -613,14 +614,14 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
return result;
}
-/*!
- \since 5.7
+/*! \internal
+ \since 6.6
\overload
Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
- QChar starting at \a buffer. The buffer is expected to be large enough
- to hold the result. An upper bound for the size of the buffer is
- \c in.size() QChars.
+ QChar starting at \a dst in the destination buffer. The buffer is expected
+ to be large enough to hold the result. An upper bound for the size of the
+ buffer is \c in.size() QChars.
If, during decoding, an error occurs, a QChar::ReplacementCharacter is
written.
@@ -628,11 +629,12 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
Returns a pointer to one past the last QChar written.
This function never throws.
-*/
-QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
+ For QChar buffers, instead of casting manually, you can use the static
+ QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
+*/
+char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
{
- ushort *dst = reinterpret_cast<ushort *>(buffer);
const uchar *const start = reinterpret_cast<const uchar *>(in.data());
const uchar *src = start;
const uchar *end = src + in.size();
@@ -655,7 +657,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
do {
uchar b = *src++;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+ const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
if (res < 0) {
// decoding error
*dst++ = QChar::ReplacementCharacter;
@@ -664,7 +666,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
}
}
- return reinterpret_cast<QChar *>(dst);
+ return dst;
}
QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
@@ -685,23 +687,22 @@ QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *stat
return result;
}
-QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
+char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
{
qsizetype len = in.size();
Q_ASSERT(state);
if (!len)
- return out;
+ return dst;
- ushort replacement = QChar::ReplacementCharacter;
+ char16_t replacement = QChar::ReplacementCharacter;
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = QChar::Null;
- int res;
+ qsizetype res;
uchar ch = 0;
- ushort *dst = reinterpret_cast<ushort *>(out);
const uchar *src = reinterpret_cast<const uchar *>(in.data());
const uchar *end = src + len;
@@ -729,7 +730,7 @@ QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::
// copy to our state and return
state->remainingChars = remainingCharsCount + newCharsToCopy;
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
- return out;
+ return dst;
} else if (!headerdone) {
// eat the UTF-8 BOM
if (dst[-1] == 0xfeff)
@@ -785,14 +786,14 @@ QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::
state->remainingChars = 0;
}
- return reinterpret_cast<QChar *>(dst);
+ return dst;
}
struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
{
struct NoOutput {};
- static void appendUtf16(const NoOutput &, ushort) {}
- static void appendUcs4(const NoOutput &, uint) {}
+ static void appendUtf16(const NoOutput &, char16_t) {}
+ static void appendUcs4(const NoOutput &, char32_t) {}
};
QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
@@ -815,7 +816,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
isValidAscii = false;
QUtf8NoOutputTraits::NoOutput output;
- int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
+ const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
if (res < 0) {
// decoding error
return { false, false };
@@ -826,9 +827,9 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
return { true, isValidAscii };
}
-int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
+int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
{
- auto src1 = reinterpret_cast<const char8_t *>(utf8.data());
+ auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
auto end1 = src1 + utf8.size();
auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
auto end2 = src2 + utf16.size();
@@ -842,7 +843,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
if (uc1 >= 0x80) {
char32_t *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
+ qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
if (res < 0) {
// decoding error
uc1 = QChar::ReplacementCharacter;
@@ -853,7 +854,10 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
uc2 = QChar::surrogateToUcs4(uc2, *src2++);
}
-
+ if (cs == Qt::CaseInsensitive) {
+ uc1 = QChar::toCaseFolded(uc1);
+ uc2 = QChar::toCaseFolded(uc2);
+ }
if (uc1 != uc2)
return int(uc1) - int(uc2);
}
@@ -863,9 +867,9 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
return (end1 > src1) - int(end2 > src2);
}
-int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1String s)
+int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
{
- uint uc1 = QChar::Null;
+ char32_t uc1 = QChar::Null;
auto src1 = reinterpret_cast<const uchar *>(utf8.data());
auto end1 = src1 + utf8.size();
auto src2 = reinterpret_cast<const uchar *>(s.latin1());
@@ -873,14 +877,18 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1String s)
while (src1 < end1 && src2 < end2) {
uchar b = *src1++;
- uint *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+ char32_t *output = &uc1;
+ const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
if (res < 0) {
// decoding error
uc1 = QChar::ReplacementCharacter;
}
- uint uc2 = *src2++;
+ char32_t uc2 = *src2++;
+ if (cs == Qt::CaseInsensitive) {
+ uc1 = QChar::toCaseFolded(uc1);
+ uc2 = QChar::toCaseFolded(uc2);
+ }
if (uc1 != uc2)
return int(uc1) - int(uc2);
}
@@ -889,6 +897,52 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1String s)
return (end1 > src1) - (end2 > src2);
}
+/*! \internal
+    Compares the UTF-8 byte sequences \a lhs and \a rhs and returns a value
+    less than, equal to, or greater than zero.
+
+    With Qt::CaseSensitive the common prefix is compared bytewise with
+    memcmp() and ties are broken by length. Otherwise both inputs are decoded
+    one code point at a time (a sequence that fails to decode is treated as
+    QChar::ReplacementCharacter), case-folded and compared; if one input runs
+    out first, it sorts first.
+*/
+int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
+{
+    if (lhs.isEmpty())
+        return qt_lencmp(0, rhs.size());
+
+    // Mirror the check above for rhs: an empty view may carry a null data()
+    // pointer, and memcmp() requires valid pointers even for a zero length.
+    // The result is the same one the code below would produce (lhs is
+    // non-empty here, so it sorts after the empty rhs).
+    if (rhs.isEmpty())
+        return qt_lencmp(lhs.size(), 0);
+
+    if (cs == Qt::CaseSensitive) {
+        const auto l = std::min(lhs.size(), rhs.size());
+        int r = memcmp(lhs.data(), rhs.data(), l);
+        return r ? r : qt_lencmp(lhs.size(), rhs.size());
+    }
+
+    char32_t uc1 = QChar::Null;
+    auto src1 = reinterpret_cast<const uchar *>(lhs.data());
+    auto end1 = src1 + lhs.size();
+    char32_t uc2 = QChar::Null;
+    auto src2 = reinterpret_cast<const uchar *>(rhs.data());
+    auto end2 = src2 + rhs.size();
+
+    while (src1 < end1 && src2 < end2) {
+        // decode one code point from lhs into uc1
+        uchar b = *src1++;
+        char32_t *output = &uc1;
+        qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+        if (res < 0) {
+            // decoding error
+            uc1 = QChar::ReplacementCharacter;
+        }
+
+        // decode one code point from rhs into uc2
+        b = *src2++;
+        output = &uc2;
+        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src2, end2);
+        if (res < 0) {
+            // decoding error
+            uc2 = QChar::ReplacementCharacter;
+        }
+
+        uc1 = QChar::toCaseFolded(uc1);
+        uc2 = QChar::toCaseFolded(uc2);
+        if (uc1 != uc2)
+            return int(uc1) - int(uc2);
+    }
+
+    // the shorter string sorts first
+    return (end1 > src1) - (end2 > src2);
+}
+
+#ifndef QT_BOOTSTRAPPED
QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
{
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
@@ -898,7 +952,7 @@ QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *s
QByteArray d(length, Qt::Uninitialized);
char *end = convertFromUnicode(d.data(), in, state, endian);
- Q_ASSERT(end - d.constData() == d.length());
+ Q_ASSERT(end - d.constData() == d.size());
Q_UNUSED(end);
return d;
}
@@ -921,13 +975,13 @@ char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::St
out += 2;
}
if (endian == BigEndianness)
- qToBigEndian<ushort>(in.data(), in.length(), out);
+ qToBigEndian<char16_t>(in.data(), in.size(), out);
else
- qToLittleEndian<ushort>(in.data(), in.length(), out);
+ qToLittleEndian<char16_t>(in.data(), in.size(), out);
state->remainingChars = 0;
state->internalState |= HeaderDone;
- return out + 2*in.length();
+ return out + 2*in.size();
}
QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
@@ -996,11 +1050,11 @@ QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
}
- int nPairs = (end - chars) >> 1;
+ qsizetype nPairs = (end - chars) >> 1;
if (endian == BigEndianness)
- qFromBigEndian<ushort>(chars, nPairs, out);
+ qFromBigEndian<char16_t>(chars, nPairs, out);
else
- qFromLittleEndian<ushort>(chars, nPairs, out);
+ qFromLittleEndian<char16_t>(chars, nPairs, out);
out += nPairs;
state->state_data[Endian] = endian;
@@ -1022,13 +1076,12 @@ QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
{
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
- int length = 4*in.size();
+ qsizetype length = 4*in.size();
if (writeBom)
length += 4;
QByteArray ba(length, Qt::Uninitialized);
char *end = convertFromUnicode(ba.data(), in, state, endian);
- Q_ASSERT(end - ba.constData() == length);
- Q_UNUSED(end);
+ ba.truncate(end - ba.constData());
return ba;
}
@@ -1037,10 +1090,6 @@ char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::St
Q_ASSERT(state);
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
- qsizetype length = 4*in.length();
- if (writeBom)
- length += 4;
-
if (endian == DetectEndianness)
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
@@ -1062,9 +1111,9 @@ char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::St
}
const QChar *uc = in.data();
- const QChar *end = in.data() + in.length();
+ const QChar *end = in.data() + in.size();
QChar ch;
- uint ucs4;
+ char32_t ucs4;
if (state->remainingChars == 1) {
auto character = state->state_data[Data];
Q_ASSERT(character <= 0xFFFF);
@@ -1147,7 +1196,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
if (state->flags & QStringConverter::Flag::ConvertInitialBom)
headerdone = true;
- int num = state->remainingChars;
+ qsizetype num = state->remainingChars;
state->remainingChars = 0;
if (!headerdone || endian == DetectEndianness || num) {
@@ -1165,7 +1214,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
endian = LittleEndianness;
}
}
- uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
+ char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
if (headerdone || code != QChar::ByteOrderMark) {
if (QChar::requiresSurrogates(code)) {
*out++ = QChar(QChar::highSurrogate(code));
@@ -1184,7 +1233,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
while (chars < end) {
tuple[num++] = *chars++;
if (num == 4) {
- uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
+ char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
for (char16_t c : QChar::fromUcs4(code))
*out++ = c;
num = 0;
@@ -1203,193 +1252,378 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
return out;
}
+#endif // !QT_BOOTSTRAPPED
#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
-static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
+int QLocal8Bit::checkUtf8()
{
- qsizetype length = in.size();
- const char *chars = in.data();
-
- Q_ASSERT(state);
- if (state->flags & QStringConverter::Flag::Stateless) // temporary
- state = nullptr;
-
- if (!chars || !length)
- return QString();
-
- int copyLocation = 0;
- int extra = 2;
- if (state && state->remainingChars) {
- copyLocation = state->remainingChars;
- extra += copyLocation;
- }
- qsizetype newLength = length + extra;
- char *mbcs = new char[newLength];
- //ensure that we have a NULL terminated string
- mbcs[newLength-1] = 0;
- mbcs[newLength-2] = 0;
- memcpy(&(mbcs[copyLocation]), chars, length);
- if (copyLocation) {
- //copy the last character from the state
- mbcs[0] = (char)state->state_data[0];
- state->remainingChars = 0;
- }
- const char *mb = mbcs;
- const char *next = 0;
- QString s;
- while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
- wchar_t wc[2] ={0};
- int charlength = next - mb;
- int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
- if (len>0) {
- s.append(QChar(wc[0]));
- } else {
- int r = GetLastError();
- //check if the character being dropped is the last character
- if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
- state->remainingChars = 1;
- state->state_data[0] = (char)*mb;
- }
- }
- mb = next;
- }
- delete [] mbcs;
- return s;
+ return GetACP() == CP_UTF8 ? 1 : -1;
}
-
-QString QLocal8Bit::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
+QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) // convenience overload without an explicit code page
{
- qsizetype length = in.size();
+ return convertToUnicode_sys(in, CP_ACP, state); // delegates to the (in, codePage, state) overload, passing CP_ACP as the code page
+}
- Q_ASSERT(length < INT_MAX); // ### FIXME
+QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
+ QStringConverter::State *state)
+{
const char *mb = in.data();
- int mblen = length;
+ qsizetype mblen = in.size();
+
+ Q_ASSERT(state);
+ qsizetype &invalidChars = state->invalidChars;
+ using Flag = QStringConverter::Flag;
+ const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
+ const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
+ : QChar::ReplacementCharacter;
+ if (state->flags & Flag::Stateless) {
+ Q_ASSERT(state->remainingChars == 0);
+ state = nullptr;
+ }
if (!mb || !mblen)
return QString();
- QVarLengthArray<wchar_t, 4096> wc(4096);
- int len;
+ // Use a local stack-buffer at first to allow us a decently large container
+ // to avoid a lot of resizing, without also returning an overallocated
+ // QString to the user for small strings.
+ // Then we can be fast for small strings and take the hit of extra resizes
+ // and measuring how much storage is needed for large strings.
+ std::array<wchar_t, 4096> buf;
+ wchar_t *out = buf.data();
+ qsizetype outlen = buf.size();
+
QString sp;
- bool prepend = false;
- char state_data = 0;
- int remainingChars = 0;
-
- //save the current state information
- if (state) {
- state_data = (char)state->state_data[0];
- remainingChars = state->remainingChars;
- }
- //convert the pending character (if available)
- if (state && remainingChars) {
- char prev[3] = {0};
- prev[0] = state_data;
- prev[1] = mb[0];
- remainingChars = 0;
- len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- prev, 2, wc.data(), wc.length());
- if (len) {
- sp.append(QChar(wc[0]));
- if (mblen == 1) {
- state->remainingChars = 0;
- return sp;
- }
- prepend = true;
- mb++;
- mblen--;
- wc[0] = 0;
+ // Return a pointer to storage where we have enough space for `size`
+ const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
+ if (outlen >= size)
+ return {out, outlen};
+ const bool wasStackBuffer = sp.isEmpty();
+ const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
+ const qsizetype offset = qsizetype(std::distance(begin, out));
+ qsizetype newSize = 0;
+ if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
+ Q_CHECK_PTR(false);
+ return {nullptr, 0};
}
+ sp.resize(newSize);
+ auto it = reinterpret_cast<wchar_t *>(sp.data());
+ if (wasStackBuffer)
+ it = std::copy_n(buf.data(), offset, it);
+ else
+ it += offset;
+ return {it, size};
+ };
+
+ // Convert the pending characters (if available)
+ while (state && state->remainingChars && mblen) {
+ QStringConverter::State localState;
+ localState.flags = state->flags;
+ // Use at most 6 characters as a guess for the longest encoded character
+ // in any multibyte encoding.
+ // Even with a total of 2 bytes of overhead that would leave around
+ // 2^(4 * 8) possible characters
+ std::array<char, 6> prev = {0};
+ Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
+ qsizetype index = 0;
+ for (; index < state->remainingChars; ++index)
+ prev[index] = state->state_data[index];
+ const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
+ for (qsizetype i = 0; i < toCopy; ++i, ++index)
+ prev[index] = mb[i];
+ mb += toCopy;
+ mblen -= toCopy;
+
+ // Recursing:
+ // Since we are using a clean local state it will try to decode what was
+ // stored in our state + some extra octets from input (`prev`). If some
+ // part fails we will have those characters stored in the local state's
+ // storage, and we can extract those. It may also output some
+ // replacement characters, which we'll count in the invalidChars.
+ // In the best case we only do this once, but we will loop until we have
+ // resolved all the remaining characters or we have run out of new input
+ // in which case we may still have remaining characters.
+ const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
+ &localState);
+ std::tie(out, outlen) = growOut(tmp.size());
+ if (!out)
+ return {};
+ out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
+ outlen -= tmp.size();
+ const qsizetype tail = toCopy - localState.remainingChars;
+ if (tail >= 0) {
+ // Everything left to process comes from `in`, so we can stop
+ // looping. Adjust the window for `in` and unset remainingChars to
+ // signal that we're done.
+ mb -= localState.remainingChars;
+ mblen += localState.remainingChars;
+ localState.remainingChars = 0;
+ }
+ state->remainingChars = localState.remainingChars;
+ state->invalidChars += localState.invalidChars;
+ std::copy_n(localState.state_data, state->remainingChars, state->state_data);
}
- while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
- mb, mblen, wc.data(), wc.length()))) {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- mb, mblen, 0, 0);
- wc.resize(wclen);
- } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
- //find the last non NULL character
- while (mblen > 1 && !(mb[mblen-1]))
- mblen--;
- //check whether, we hit an invalid character in the middle
- if ((mblen <= 1) || (remainingChars && state_data))
- return convertToUnicodeCharByChar(in, state);
- //Remove the last character and try again...
- state_data = mb[mblen-1];
- remainingChars = 1;
- mblen--;
+ Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
+
+ // Need it in this scope, since we try to decrease our window size if we
+ // encounter an error
+ int nextIn = qt_saturate<int>(mblen);
+ while (mblen > 0) {
+ std::tie(out, outlen) = growOut(1); // Need space for at least one character
+ if (!out)
+ return {};
+ const int nextOut = qt_saturate<int>(outlen);
+ int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
+ if (len) {
+ mb += nextIn;
+ mblen -= nextIn;
+ out += len;
+ outlen -= len;
} else {
- // Fail.
- qWarning("MultiByteToWideChar: Cannot convert multibyte text");
- break;
+ int r = GetLastError();
+ if (r == ERROR_INSUFFICIENT_BUFFER) {
+ const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
+ std::tie(out, outlen) = growOut(wclen);
+ if (!out)
+ return {};
+ } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
+ // Can't decode the current window, so either store the state,
+ // reduce window size or output a replacement character.
+
+ // Check if we can store all remaining characters in the state
+ // to be used next time we're called:
+ if (state && mblen <= q20::ssize(state->state_data)) {
+ state->remainingChars = mblen;
+ std::copy_n(mb, mblen, state->state_data);
+ mb += mblen;
+ mblen = 0;
+ break;
+ }
+
+ // .. if not, try to find the last valid character in the window
+ // and try again with a shrunken window:
+ if (nextIn > 1) {
+ // There may be some incomplete data at the end of our current
+ // window, so decrease the window size and try again.
+ // In the worst case scenario there are gigabytes of undecodable
+ // garbage, but what are we supposed to do about that?
+ const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
+ if (it != mb)
+ nextIn = int(it - mb);
+ else
+ --nextIn;
+ continue;
+ }
+
+ // Finally, we are forced to output a replacement character for
+ // the first byte in the window:
+ std::tie(out, outlen) = growOut(1);
+ if (!out)
+ return {};
+ *out = replacementCharacter;
+ ++invalidChars;
+ ++out;
+ --outlen;
+ ++mb;
+ --mblen;
+ } else {
+ // Fail.
+ qWarning("MultiByteToWideChar: Cannot convert multibyte text");
+ break;
+ }
}
+ nextIn = qt_saturate<int>(mblen);
}
- if (len <= 0)
- return QString();
+ if (sp.isEmpty()) {
+ // We must have only used the stack buffer
+ if (out != buf.data()) // else: we return null-string
+ sp = QStringView(buf.data(), out).toString();
+ } else{
+ const auto begin = reinterpret_cast<wchar_t *>(sp.data());
+ sp.truncate(std::distance(begin, out));
+ }
- if (wc[len-1] == 0) // len - 1: we don't want terminator
- --len;
+ if (sp.size() && sp.back().isNull())
+ sp.chop(1);
- //save the new state information
- if (state) {
- state->state_data[0] = (char)state_data;
- state->remainingChars = remainingChars;
- }
- QString s((QChar*)wc.data(), len);
- if (prepend) {
- return sp+s;
+ if (!state && mblen > 0) {
+ // We have trailing character(s) that could not be converted, and
+ // nowhere to cache them
+ sp.resize(sp.size() + mblen, replacementCharacter);
+ invalidChars += mblen;
}
- return s;
+ return sp;
}
-QByteArray QLocal8Bit::convertFromUnicode(QStringView in, QStringConverter::State *state)
+QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
{
- const QChar *ch = in.data();
+ return convertFromUnicode_sys(in, CP_ACP, state);
+}
+
+QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
+ QStringConverter::State *state)
+{
+ const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
qsizetype uclen = in.size();
- Q_ASSERT(uclen < INT_MAX); // ### FIXME
Q_ASSERT(state);
- Q_UNUSED(state); // ### Fixme
- if (state->flags & QStringConverter::Flag::Stateless) // temporary
+ // The Windows API has a *boolean* out-parameter that says if a replacement
+ // character was used, but it gives us no way to know _how many_ were used.
+ // Since we cannot simply scan the string for replacement characters
+ // (which is potentially a question mark, and thus a valid character),
+ // we simply do not track the number of invalid characters here.
+ // auto &invalidChars = state->invalidChars;
+
+ using Flag = QStringConverter::Flag;
+ if (state->flags & Flag::Stateless) { // temporary
+ Q_ASSERT(state->remainingChars == 0);
state = nullptr;
+ }
if (!ch)
return QByteArray();
if (uclen == 0)
return QByteArray("");
- BOOL used_def;
- QByteArray mb(4096, 0);
- int len;
- while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
- mb.data(), mb.size()-1, 0, &used_def)))
- {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
- (const wchar_t*)ch, uclen,
- 0, 0, 0, &used_def));
- // and try again...
+
+ // Use a local stack-buffer at first to allow us a decently large container
+ // to avoid a lot of resizing, without also returning an overallocated
+ // QByteArray to the user for small strings.
+ // Then we can be fast for small strings and take the hit of extra resizes
+ // and measuring how much storage is needed for large strings.
+ std::array<char, 4096> buf;
+ char *out = buf.data();
+ qsizetype outlen = buf.size();
+ QByteArray mb;
+
+ if (state && state->remainingChars > 0) {
+ Q_ASSERT(state->remainingChars == 1);
+ // Let's try to decode the pending character
+ wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
+ // Check if the second character is a valid low surrogate,
+ // otherwise we'll just decode the first character, for which windows
+ // will output a replacement character.
+ const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
+ int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
+ nullptr);
+ if (!len)
+ return {}; // Cannot recover, and I refuse to believe it was a size limitation
+ out += len;
+ outlen -= len;
+ if (validCodePoint) {
+ ++ch;
+ --uclen;
+ }
+ state->remainingChars = 0;
+ state->state_data[0] = 0;
+ if (uclen == 0)
+ return QByteArrayView(buf.data(), len).toByteArray();
+ }
+
+ if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
+ // We can handle a missing low surrogate at the end of the string,
+ // so if there is one, exclude it now and store it in the state.
+ state->remainingChars = 1;
+ state->state_data[0] = ch[uclen - 1];
+ --uclen;
+ if (uclen == 0)
+ return QByteArray();
+ }
+
+ Q_ASSERT(uclen > 0);
+
+ // Return a pointer to storage where we have enough space for `size`
+ const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
+ if (outlen >= size)
+ return {out, outlen};
+ const bool wasStackBuffer = mb.isEmpty();
+ const auto begin = wasStackBuffer ? buf.data() : mb.data();
+ const qsizetype offset = qsizetype(std::distance(begin, out));
+ qsizetype newSize = 0;
+ if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
+ Q_CHECK_PTR(false);
+ return {nullptr, 0};
+ }
+ mb.resize(newSize);
+ auto it = mb.data();
+ if (wasStackBuffer)
+ it = std::copy_n(buf.data(), offset, it);
+ else
+ it += offset;
+ return {it, size};
+ };
+
+ const auto getNextWindowSize = [&]() {
+ int nextIn = qt_saturate<int>(uclen);
+ // The Windows API has some issues if the current window ends in the
+ // middle of a surrogate pair, so we avoid that:
+ if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
+ --nextIn;
+ return nextIn;
+ };
+
+ int len = 0;
+ while (uclen > 0) {
+ const int nextIn = getNextWindowSize();
+ std::tie(out, outlen) = growOut(1); // We need at least one byte
+ if (!out)
+ return {};
+ const int nextOut = qt_saturate<int>(outlen);
+ len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
+ if (len > 0) {
+ ch += nextIn;
+ uclen -= nextIn;
+ out += len;
+ outlen -= len;
} else {
- // Fail. Probably can't happen in fact (dwFlags is 0).
+ int r = GetLastError();
+ if (r == ERROR_INSUFFICIENT_BUFFER) {
+ int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
+ nullptr, nullptr);
+ if (neededLength <= 0) {
+ // Fail. Observed with UTF8 where the input window was max int and ended in an
+ // incomplete sequence, probably a Windows bug. We try to avoid that from
+ // happening by reducing the window size in that case. But let's keep this
+ // branch just in case of other bugs.
+#ifndef QT_NO_DEBUG
+ r = GetLastError();
+ fprintf(stderr,
+ "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
+#endif // !QT_NO_DEBUG
+ break;
+ }
+ std::tie(out, outlen) = growOut(neededLength);
+ if (!out)
+ return {};
+ // and try again...
+ } else {
+ // Fail. Probably can't happen in fact (dwFlags is 0).
#ifndef QT_NO_DEBUG
- // Can't use qWarning(), as it'll recurse to handle %ls
- fprintf(stderr,
- "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
- r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
+ // Can't use qWarning(), as it'll recurse to handle %ls
+ fprintf(stderr,
+ "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r,
+ reinterpret_cast<const wchar_t *>(
+ QStringView(ch, uclen).left(100).toString().utf16()));
#endif
- break;
+ break;
+ }
}
}
- mb.resize(len);
+ if (mb.isEmpty()) {
+ // We must have only used the stack buffer
+ if (out != buf.data()) // else: we return null-array
+ mb = QByteArrayView(buf.data(), out).toByteArray();
+ } else {
+ mb.truncate(std::distance(mb.data(), out));
+ }
return mb;
}
#endif
-void QStringConverter::State::clear()
+void QStringConverter::State::clear() noexcept
{
if (clearFn)
clearFn(this);
@@ -1400,6 +1634,22 @@ void QStringConverter::State::clear()
internalState = 0;
}
+void QStringConverter::State::reset() noexcept
+{
+ if (flags & Flag::UsesIcu) {
+#if QT_CONFIG(icu)
+ UConverter *converter = static_cast<UConverter *>(d[0]);
+ if (converter)
+ ucnv_reset(converter);
+#else
+ Q_UNREACHABLE();
+#endif
+ } else {
+ clear();
+ }
+}
+
+#ifndef QT_BOOTSTRAPPED
static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
{
return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
@@ -1459,28 +1709,17 @@ static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state
{
return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
}
+#endif // !QT_BOOTSTRAPPED
-void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
-
-static QChar *fromLatin1(QChar *out, QByteArrayView in, QStringConverter::State *state)
-{
- Q_ASSERT(state);
- Q_UNUSED(state);
-
- qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
- return out + in.size();
-}
-
-
-static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
+char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
{
Q_ASSERT(state);
if (state->flags & QStringConverter::Flag::Stateless) // temporary
state = nullptr;
const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
- int invalid = 0;
- for (qsizetype i = 0; i < in.length(); ++i) {
+ qsizetype invalid = 0;
+ for (qsizetype i = 0; i < in.size(); ++i) {
if (in[i] > QChar(0xff)) {
*out = replacement;
++invalid;
@@ -1497,26 +1736,28 @@ static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
{
QString s = QLocal8Bit::convertToUnicode(in, state);
- memcpy(out, s.constData(), s.length()*sizeof(QChar));
- return out + s.length();
+ memcpy(out, s.constData(), s.size()*sizeof(QChar));
+ return out + s.size();
}
static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
{
QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
- memcpy(out, s.constData(), s.length());
- return out + s.length();
+ memcpy(out, s.constData(), s.size());
+ return out + s.size();
}
static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
+#ifndef QT_BOOTSTRAPPED
static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
+#endif
static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
@@ -1621,6 +1862,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
\value Stateless Ignore possible converter states between different function calls
to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
sequence of data is encountered.
+ \omitvalue UsesIcu
*/
/*!
@@ -1629,13 +1871,13 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
\value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
detected by a leading byte order mark. If none exists or when encoding, the system byte order will
be assumed.
- \value Utf16BE Create a converter to or from big endian UTF-16.
- \value Utf16LE Create a converter to or from litte endian UTF-16.
+ \value Utf16BE Create a converter to or from big-endian UTF-16.
+ \value Utf16LE Create a converter to or from little-endian UTF-16.
\value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
detected by a leading byte order mark. If none exists or when encoding, the system byte order will
be assumed.
- \value Utf32BE Create a converter to or from big endian UTF-32.
- \value Utf32LE Create a converter to or from litte endian UTF-32.
+ \value Utf32BE Create a converter to or from big-endian UTF-32.
+ \value Utf32LE Create a converter to or from little-endian UTF-32.
\value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
\value System Create a converter to or from the underlying encoding of the
operating systems locale. This is always assumed to be UTF-8 for Unix based
@@ -1651,34 +1893,31 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
{
{ "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
+#ifndef QT_BOOTSTRAPPED
{ "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
{ "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
{ "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
{ "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
{ "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
{ "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
- { "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
+#endif
+ { "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
{ "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
};
// match names case insensitive and skipping '-' and '_'
static bool nameMatch(const char *a, const char *b)
{
- while (*a && *b) {
- if (*a == '-' || *a == '_') {
+ do {
+ while (*a == '-' || *a == '_')
++a;
- continue;
- }
- if (*b == '-' || *b == '_') {
+ while (*b == '-' || *b == '_')
++b;
- continue;
- }
- if (toupper(*a) != toupper(*b))
- return false;
- ++a;
- ++b;
- }
- return !*a && !*b;
+ if (!*a && !*b) // end of both strings
+ return true;
+ } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(*b++));
+
+ return false;
}
@@ -1692,6 +1931,234 @@ static bool nameMatch(const char *a, const char *b)
\internal
*/
+
+#if QT_CONFIG(icu)
+// only derives from QStringConverter to get access to protected types
+struct QStringConverterICU : QStringConverter
+{
+ static void clear_function(QStringConverterBase::State *state) noexcept
+ {
+ ucnv_close(static_cast<UConverter *>(state->d[0]));
+ state->d[0] = nullptr;
+ }
+
+ static void ensureConverter(QStringConverter::State *state)
+ {
+ // old code might reset the state via clear instead of reset
+ // in that case, the converter has been closed, and we have to reopen it
+ if (state->d[0] == nullptr)
+ state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
+ }
+
+ static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
+ {
+ ensureConverter(state);
+
+ auto icu_conv = static_cast<UConverter *>(state->d[0]);
+ UErrorCode err = U_ZERO_ERROR;
+ auto source = in.data();
+ auto sourceLimit = in.data() + in.size();
+
+ qsizetype length = toLen(in.size());
+
+ UChar *target = reinterpret_cast<UChar *>(out);
+ auto targetLimit = target + length;
+ // We explicitly clean up anyway, so no need to set flush to true,
+ // which would just reset the converter.
+ UBool flush = false;
+
+ // If the QStringConverter was moved, the state that we used as a context is stale now.
+ UConverterToUCallback action;
+ const void *context;
+ ucnv_getToUCallBack(icu_conv, &action, &context);
+ if (context != state)
+ ucnv_setToUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
+
+ ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
+ // We did reserve enough space:
+ Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
+ if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
+ if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
+ ucnv_reset(icu_conv);
+ state->invalidChars += leftOver;
+ }
+ }
+ return reinterpret_cast<QChar *>(target);
+ }
+
+ static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
+ {
+ ensureConverter(state);
+ auto icu_conv = static_cast<UConverter *>(state->d[0]);
+ UErrorCode err = U_ZERO_ERROR;
+ auto source = reinterpret_cast<const UChar *>(in.data());
+ auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
+
+ qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
+
+ char *target = out;
+ char *targetLimit = out + length;
+ UBool flush = false;
+
+ // If the QStringConverter was moved, the state that we used as a context is stale now.
+ UConverterFromUCallback action;
+ const void *context;
+ ucnv_getFromUCallBack(icu_conv, &action, &context);
+ if (context != state)
+ ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
+
+ ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
+ // We did reserve enough space:
+ Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
+ if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
+ if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
+ ucnv_reset(icu_conv);
+ state->invalidChars += leftOver;
+ }
+ }
+ return target;
+ }
+
+ Q_DISABLE_COPY_MOVE(QStringConverterICU)
+
+ template<qsizetype X>
+ static qsizetype fromLen(qsizetype inLength)
+ {
+ return X * inLength * sizeof(UChar);
+ }
+
+ static qsizetype toLen(qsizetype inLength)
+ {
+
+ /* Assumption: each input char might map to a different codepoint
+ Each codepoint can take up to 4 bytes == 2 QChar
+ We can ignore reserving space for a BOM, as only UTF encodings use one
+ and those are not handled by the ICU converter.
+ */
+ return 2 * inLength;
+ }
+
+ static constexpr QStringConverter::Interface forLength[] = {
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
+ };
+
+ static UConverter *createConverterForName(const char *name, const State *state)
+ {
+ Q_ASSERT(name);
+ Q_ASSERT(state);
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter *conv = ucnv_open(name, &status);
+ if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
+ ucnv_close(conv);
+ return nullptr;
+ }
+
+ if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
+ UErrorCode error = U_ZERO_ERROR;
+
+ auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
+ const char *, int32_t length,
+ UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ *err = U_ZERO_ERROR;
+ UChar c = '\0';
+ ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ };
+ ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
+
+ auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
+ const UChar *, int32_t length,
+ UChar32, UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ *err = U_ZERO_ERROR;
+ const UChar replacement[] = { 0 };
+ const UChar *stringBegin = std::begin(replacement);
+ ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ };
+ ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
+ } else {
+ UErrorCode error = U_ZERO_ERROR;
+
+ auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
+ const char *codeUnits,int32_t length,
+ UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ // use existing ICU callback for logic
+ UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
+
+ };
+ ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
+
+ auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
+ const UChar *codeUnits, int32_t length,
+ UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ // use existing ICU callback for logic
+ UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
+ codePoint, reason, err);
+ };
+ ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
+ }
+ return conv;
+ }
+
+ static const QStringConverter::Interface *make_icu_converter(
+ QStringConverterBase::State *state,
+ const char *name)
+ {
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter *conv = createConverterForName(name, state);
+ if (!conv)
+ return nullptr;
+
+ const char *icuName = ucnv_getName(conv, &status);
+ // ucnv_getStandardName returns a name which is owned by the library;
+ // we can thus store it in the state without worrying about its lifetime
+ const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
+ if (U_FAILURE(status) || !persistentName) {
+ status = U_ZERO_ERROR;
+ persistentName = ucnv_getStandardName(icuName, "IANA", &status);
+ }
+ state->d[1] = const_cast<char *>(persistentName);
+ state->d[0] = conv;
+ state->flags |= QStringConverterBase::Flag::UsesIcu;
+ qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
+ state->clearFn = QStringConverterICU::clear_function;
+ if (maxCharSize > 8 || maxCharSize < 1) {
+ qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
+ return nullptr;
+ } else {
+ return &forLength[maxCharSize - 1];
+ }
+
+ }
+
+};
+#endif
+
/*!
\internal
*/
@@ -1700,7 +2167,27 @@ QStringConverter::QStringConverter(const char *name, Flags f)
{
auto e = encodingForName(name);
if (e)
- iface = encodingInterfaces + int(e.value());
+ iface = encodingInterfaces + int(*e);
+#if QT_CONFIG(icu)
+ else
+ iface = QStringConverterICU::make_icu_converter(&state, name);
+#endif
+}
+
+
+const char *QStringConverter::name() const noexcept
+{
+ if (!iface)
+ return nullptr;
+ if (state.flags & QStringConverter::Flag::UsesIcu) {
+#if QT_CONFIG(icu)
+ return static_cast<const char*>(state.d[1]);
+#else
+ return nullptr;
+#endif
+ } else {
+ return iface->name;
+ }
}
/*!
@@ -1733,17 +2220,26 @@ QStringConverter::QStringConverter(const char *name, Flags f)
Returns the canonical name of the encoding this QStringConverter can encode or decode.
Returns a nullptr if the converter is not valid.
+ The returned name is UTF-8 encoded.
\sa isValid()
*/
/*!
- Returns an optional encoding for \a name. The optional is empty if the name could
- not get converted to a valid encoding.
+ Convert \a name to the corresponding \l Encoding member, if there is one.
+
+ If the \a name is not the name of a codec listed in the Encoding enumeration,
+ \c{std::nullopt} is returned. Such a name may, nonetheless, be accepted by
+ the QStringConverter constructor when Qt is built with ICU, if ICU provides a
+ converter with the given name.
+
+ \a name is expected to be UTF-8 encoded.
*/
-std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name)
+std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
{
- for (int i = 0; i < LastEncoding + 1; ++i) {
+ if (!name)
+ return std::nullopt;
+ for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
if (nameMatch(encodingInterfaces[i].name, name))
return QStringConverter::Encoding(i);
}
@@ -1752,6 +2248,7 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(cons
return std::nullopt;
}
+#ifndef QT_BOOTSTRAPPED
/*!
Returns the encoding for the content of \a data if it can be determined.
\a expectedFirstCharacter can be passed as an additional hint to help determine
@@ -1759,15 +2256,16 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(cons
The returned optional is empty, if the encoding is unclear.
*/
-std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter)
+std::optional<QStringConverter::Encoding>
+QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
{
// someone set us up the BOM?
qsizetype arraySize = data.size();
if (arraySize > 3) {
- uint uc = qFromUnaligned<uint>(data.data());
- if (uc == qToBigEndian(uint(QChar::ByteOrderMark)))
+ char32_t uc = qFromUnaligned<char32_t>(data.data());
+ if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
return QStringConverter::Utf32BE;
- if (uc == qToLittleEndian(uint(QChar::ByteOrderMark)))
+ if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
return QStringConverter::Utf32LE;
if (expectedFirstCharacter) {
// catch also anything starting with the expected character
@@ -1784,10 +2282,10 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByt
}
if (arraySize > 1) {
- ushort uc = qFromUnaligned<ushort>(data.data());
- if (uc == qToBigEndian(ushort(QChar::ByteOrderMark)))
+ char16_t uc = qFromUnaligned<char16_t>(data.data());
+ if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
return QStringConverter::Utf16BE;
- if (uc == qToLittleEndian(ushort(QChar::ByteOrderMark)))
+ if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
return QStringConverter::Utf16LE;
if (expectedFirstCharacter) {
// catch also anything starting with the expected character
@@ -1800,51 +2298,143 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByt
return std::nullopt;
}
-/*!
- Tries to determine the encoding of the HTML in \a data by looking at leading byte
- order marks or a charset specifier in the HTML meta tag. If the optional is empty,
- the encoding specified is not supported by QStringConverter. If no encoding is
- detected, the method returns Utf8.
-*/
-std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
+static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
{ // Extracts the charset name announced in an HTML meta tag; the result is empty when none is found.
- // determine charset
- auto encoding = encodingForData(data);
- if (encoding)
- // trust the initial BOM
- return encoding;
+ static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
+ static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower(); // only the first 1024 bytes are examined, lowercased so the searches below ignore case
- int pos = header.indexOf("meta ");
+ qsizetype pos = metaSearcher.indexIn(header);
if (pos != -1) {
- pos = header.indexOf("charset=", pos);
+ pos = charsetSearcher.indexIn(header, pos); // first "charset=" at or after the "meta " match
if (pos != -1) {
- pos += int(qstrlen("charset="));
+ pos += qstrlen("charset="); // step past the attribute name to the start of its value
if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
++pos; // skip an opening quote
- int pos2 = pos;
+ qsizetype pos2 = pos;
// The attribute can be closed with either """, "'", ">" or "/",
// none of which are valid charset characters.
while (++pos2 < header.size()) {
char ch = header.at(pos2);
if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
QByteArray name = header.mid(pos, pos2 - pos);
- int colon = name.indexOf(':');
+ qsizetype colon = name.indexOf(':');
if (colon > 0)
name = name.left(colon); // keep only the part before the first ':'
name = name.simplified();
if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
name = QByteArrayLiteral("UTF-8");
if (!name.isEmpty())
- return encodingForName(name);
+ return name; // an empty candidate falls through and the scan continues to the next terminator
}
}
}
}
+ return QByteArray(); // no usable charset attribute in the inspected header
+}
+
+/*!
+ Tries to determine the encoding of the HTML in \a data by looking at leading byte
+ order marks or a charset specifier in the HTML meta tag. If the optional is empty,
+ the encoding specified is not supported by QStringConverter. If no encoding is
+ detected, the method returns Utf8.
+
+ \sa QStringDecoder::decoderForHtml()
+*/
+std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
+{
+ // a byte order mark at the start of the data is authoritative
+ if (const std::optional<QStringConverter::Encoding> bomEncoding = encodingForData(data))
+ return bomEncoding;
+
+ // otherwise look up whatever charset the meta tag announces; an unknown
+ // name yields an empty optional
+ const QByteArray charsetName = parseHtmlMetaForEncoding(data);
+ if (!charsetName.isEmpty())
+ return encodingForName(charsetName);
+
return Utf8;
}
+// Number of codec names that availableCodecs() reports.
+static qsizetype availableCodecCount()
+{
+#if !QT_CONFIG(icu)
+ // LastEncoding is the value of the final Encoding member, not a count:
+ // encodingInterfaces holds LastEncoding + 1 entries (encodingForName()
+ // scans the same range), so add one to list the last built-in codec too.
+ return QStringConverter::Encoding::LastEncoding + 1;
+#else
+ /* icu contains also the names of what Qt provides
+ except for the special Locale one (so add one for it)
+ */
+ return 1 + ucnv_countAvailable();
+#endif
+}
+
+/*!
+ Returns a list of names of supported codecs. The names returned
+ by this function can be passed to QStringEncoder's and
+ QStringDecoder's constructor to create an encoder or decoder for
+ the given codec.
+
+ \note The order of codecs is an internal implementation detail
+ and not guaranteed to be stable.
+ */
+QStringList QStringConverter::availableCodecs()
+{
+ // Maps a position in the result list to the codec name stored there.
+ const auto codecNameAt = [](qsizetype position) -> QString
+ {
+ #if !QT_CONFIG(icu)
+ return QString::fromLatin1(encodingInterfaces[position].name);
+ #else
+ // icu does not provide "Locale", so Qt's own entry takes the first slot
+ if (position == 0)
+ return QString::fromLatin1(
+ encodingInterfaces[QStringConverter::Encoding::System].name);
+ // mirrors how a converter's name is set up: MIME name first, then IANA,
+ // then the name icu itself uses
+ const char *const icuName = ucnv_getAvailableName(int32_t(position - 1));
+ UErrorCode status = U_ZERO_ERROR;
+ const char *resolved = ucnv_getStandardName(icuName, "MIME", &status);
+ if (U_FAILURE(status) || !resolved) {
+ status = U_ZERO_ERROR;
+ resolved = ucnv_getStandardName(icuName, "IANA", &status);
+ }
+ return QString::fromLatin1(resolved ? resolved : icuName);
+ #endif
+ };
+
+ const qsizetype total = availableCodecCount();
+ QStringList names;
+ names.reserve(total);
+ for (qsizetype position = 0; position < total; ++position)
+ names.append(codecNameAt(position));
+ return names;
+}
+
+/*!
+ Tries to determine the encoding of the HTML in \a data by looking at leading byte
+ order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
+ matching the encoding. If the returned decoder is not valid,
+ the encoding specified is not supported by QStringConverter. If no encoding is
+ detected, the method returns a decoder for Utf8.
+
+ \sa isValid()
+*/
+QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
+{
+ // a byte order mark at the start of the data is authoritative
+ if (const std::optional<QStringConverter::Encoding> bomEncoding = encodingForData(data))
+ return QStringDecoder(*bomEncoding);
+
+ // otherwise build the decoder from the charset named in the meta tag,
+ // defaulting to UTF-8 when the tag gives none
+ const QByteArray charsetName = parseHtmlMetaForEncoding(data);
+ if (charsetName.isEmpty())
+ return QStringDecoder(Utf8);
+ return QStringDecoder(charsetName);
+}
+#endif // !QT_BOOTSTRAPPED
+
/*!
Returns the canonical name for encoding \a e.
*/
@@ -1912,12 +2502,14 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
*/
/*!
- \fn QByteArray QStringEncoder::encode(const QString &in)
- \fn QByteArray QStringEncoder::encode(QStringView in)
- \fn QByteArray QStringEncoder::operator()(const QString &in)
- \fn QByteArray QStringEncoder::operator()(QStringView in)
+ \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
+ \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
+ \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
+ \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
+
+ Converts \a in and returns a struct that is implicitly convertible to QByteArray.
- Converts \a in and returns the data as a byte array.
+ \snippet code/src_corelib_text_qstringconverter.cpp 5
*/
/*!
@@ -2001,12 +2593,15 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
*/
/*!
- \fn QString QStringDecoder::operator()(const QByteArray &ba)
- \fn QString QStringDecoder::decode(const QByteArray &ba)
- \fn QString QStringDecoder::operator()(QByteArrayView ba)
- \fn QString QStringDecoder::decode(QByteArrayView ba)
+ \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
+ \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
+ \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
+ \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
+
+ Converts \a ba and returns a struct that is implicitly convertible to QString.
+
- Converts \a ba and returns the data as a QString.
+ \snippet code/src_corelib_text_qstringconverter.cpp 4
*/
/*!
@@ -2031,4 +2626,10 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
\sa requiredSpace
*/
+/*!
+ \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
+ \since 6.6
+ \overload
+*/
+
QT_END_NAMESPACE