summaryrefslogtreecommitdiffstats
path: root/src/corelib/text/qstringconverter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r--src/corelib/text/qstringconverter.cpp1108
1 files changed, 866 insertions, 242 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index d252d7b667..efa625e30b 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -10,20 +10,35 @@
#include "private/qstringiterator_p.h"
#include "private/qtools_p.h"
#include "qbytearraymatcher.h"
+#include "qcontainertools_impl.h"
+#include <QtCore/qbytearraylist.h>
+
+#if QT_CONFIG(icu)
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/ucnv_err.h>
+#include <unicode/ustring.h>
+#endif
#ifdef Q_OS_WIN
#include <qt_windows.h>
#ifndef QT_BOOTSTRAPPED
#include <QtCore/qvarlengtharray.h>
+#include <QtCore/q20iterator.h>
+#include <QtCore/private/qnumeric_p.h>
#endif // !QT_BOOTSTRAPPED
#endif
+#include <array>
+
#if __has_include(<bit>) && __cplusplus > 201703L
#include <bit>
#endif
QT_BEGIN_NAMESPACE
+using namespace QtMiscUtils;
+
static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
@@ -33,8 +48,7 @@ enum { Endian = 0, Data = 1 };
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
-#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
- || defined(__ARM_NEON__)
+#if defined(__SSE2__) || defined(__ARM_NEON__)
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
{
#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
@@ -50,7 +64,7 @@ static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
}
#endif
-#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
+#if defined(__SSE2__)
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
{
// do sixteen characters at a time
@@ -187,14 +201,14 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
#ifdef __AVX2__
// do 32 characters at a time
// (this is similar to simdTestMask in qstring.cpp)
- const __m256i mask = _mm256_set1_epi8(0x80);
+ const __m256i mask = _mm256_set1_epi8(char(0x80));
for ( ; end - src >= 32; src += 32) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
if (_mm256_testz_si256(mask, data))
continue;
uint n = _mm256_movemask_epi8(data);
- Q_ASSUME(n);
+ Q_ASSERT(n);
// find the next probable ASCII character
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
@@ -244,7 +258,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
// and advance src8 and src16 to the first character that could not be compared
-static void simdCompareAscii(const char8_t *&src8, const char8_t *end8, const char16_t *&src16, const char16_t *end16)
+static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
{
int bitSpacing = 1;
qptrdiff len = qMin(end8 - src8, end16 - src16);
@@ -430,7 +444,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
return src;
}
-static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
+static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
{
}
#else
@@ -450,7 +464,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
return src;
}
-static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
+static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
{
}
#endif
@@ -497,8 +511,7 @@ QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State
char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
{
Q_ASSERT(state);
- const QChar *uc = in.data();
- qsizetype len = in.length();
+ qsizetype len = in.size();
if (!len)
return out;
@@ -515,7 +528,7 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
};
uchar *cursor = reinterpret_cast<uchar *>(out);
- const char16_t *src = reinterpret_cast<const char16_t *>(uc);
+ const char16_t *src = in.utf16();
const char16_t *const end = src + len;
if (!(state->flags & QStringDecoder::Flag::Stateless)) {
@@ -565,6 +578,21 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
return reinterpret_cast<char *>(cursor);
}
+// Encodes the Latin-1 characters viewed by `in` as UTF-8, writing the bytes
+// from `out` onwards, and returns a pointer one past the last byte written.
+// Every input character yields one or two bytes, so `out` must have room for
+// up to 2 * in.size() bytes.
+char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
+{
+    // ### SIMD-optimize:
+    for (const uchar latin1 : in) {
+        if (latin1 >= 128) {
+            // two-byte form as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row:
+            // the lead byte carries the top two bits, the continuation byte the low six
+            *out++ = 0b110'0'0000u | (latin1 >> 6);
+            *out++ = 0b10'00'0000u | (latin1 & 0b0011'1111);
+            continue;
+        }
+        // US-ASCII is copied through unchanged
+        *out++ = latin1;
+    }
+    return out;
+}
+
QString QUtf8::convertToUnicode(QByteArrayView in)
{
// UTF-8 to UTF-16 always needs the exact same number of words or less:
@@ -586,14 +614,14 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
return result;
}
-/*!
- \since 5.7
+/*! \internal
+ \since 6.6
\overload
Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
- QChar starting at \a buffer. The buffer is expected to be large enough
- to hold the result. An upper bound for the size of the buffer is
- \c in.size() QChars.
+ QChar starting at \a dst in the destination buffer. The buffer is expected
+ to be large enough to hold the result. An upper bound for the size of the
+ buffer is \c in.size() QChars.
If, during decoding, an error occurs, a QChar::ReplacementCharacter is
written.
@@ -601,11 +629,12 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
Returns a pointer to one past the last QChar written.
This function never throws.
-*/
-QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
+ For QChar buffers, instead of casting manually, you can use the static
+ QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
+*/
+char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
{
- char16_t *dst = reinterpret_cast<char16_t *>(buffer);
const uchar *const start = reinterpret_cast<const uchar *>(in.data());
const uchar *src = start;
const uchar *end = src + in.size();
@@ -628,7 +657,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
do {
uchar b = *src++;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+ const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
if (res < 0) {
// decoding error
*dst++ = QChar::ReplacementCharacter;
@@ -637,7 +666,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
}
}
- return reinterpret_cast<QChar *>(dst);
+ return dst;
}
QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
@@ -658,23 +687,22 @@ QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *stat
return result;
}
-QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
+char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
{
qsizetype len = in.size();
Q_ASSERT(state);
if (!len)
- return out;
+ return dst;
char16_t replacement = QChar::ReplacementCharacter;
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = QChar::Null;
- int res;
+ qsizetype res;
uchar ch = 0;
- char16_t *dst = reinterpret_cast<char16_t *>(out);
const uchar *src = reinterpret_cast<const uchar *>(in.data());
const uchar *end = src + len;
@@ -702,7 +730,7 @@ QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::
// copy to our state and return
state->remainingChars = remainingCharsCount + newCharsToCopy;
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
- return out;
+ return dst;
} else if (!headerdone) {
// eat the UTF-8 BOM
if (dst[-1] == 0xfeff)
@@ -758,7 +786,7 @@ QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::
state->remainingChars = 0;
}
- return reinterpret_cast<QChar *>(dst);
+ return dst;
}
struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
@@ -788,7 +816,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
isValidAscii = false;
QUtf8NoOutputTraits::NoOutput output;
- int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
+ const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
if (res < 0) {
// decoding error
return { false, false };
@@ -799,9 +827,9 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
return { true, isValidAscii };
}
-int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
+int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
{
- auto src1 = reinterpret_cast<const char8_t *>(utf8.data());
+ auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
auto end1 = src1 + utf8.size();
auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
auto end2 = src2 + utf16.size();
@@ -815,7 +843,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
if (uc1 >= 0x80) {
char32_t *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
+ qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
if (res < 0) {
// decoding error
uc1 = QChar::ReplacementCharacter;
@@ -826,7 +854,10 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
uc2 = QChar::surrogateToUcs4(uc2, *src2++);
}
-
+ if (cs == Qt::CaseInsensitive) {
+ uc1 = QChar::toCaseFolded(uc1);
+ uc2 = QChar::toCaseFolded(uc2);
+ }
if (uc1 != uc2)
return int(uc1) - int(uc2);
}
@@ -836,7 +867,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
return (end1 > src1) - int(end2 > src2);
}
-int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s)
+int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
{
char32_t uc1 = QChar::Null;
auto src1 = reinterpret_cast<const uchar *>(utf8.data());
@@ -847,13 +878,17 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s)
while (src1 < end1 && src2 < end2) {
uchar b = *src1++;
char32_t *output = &uc1;
- int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+ const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
if (res < 0) {
// decoding error
uc1 = QChar::ReplacementCharacter;
}
char32_t uc2 = *src2++;
+ if (cs == Qt::CaseInsensitive) {
+ uc1 = QChar::toCaseFolded(uc1);
+ uc2 = QChar::toCaseFolded(uc2);
+ }
if (uc1 != uc2)
return int(uc1) - int(uc2);
}
@@ -862,6 +897,52 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s)
return (end1 > src1) - (end2 > src2);
}
+/*! \internal
+    \overload
+
+    Compares the UTF-8 data viewed by \a lhs with the UTF-8 data viewed by
+    \a rhs, using case sensitivity \a cs. Returns a negative value, zero or a
+    positive value when \a lhs sorts before, equal to or after \a rhs.
+
+    With Qt::CaseSensitive the bytes are compared with memcmp() and, on a
+    common prefix, the shorter input sorts first. Otherwise one code point is
+    decoded from each side per step, both are case-folded and the folded
+    values are compared; a sequence that fails to decode is compared as
+    QChar::ReplacementCharacter.
+*/
+int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
+{
+    // An empty view may have a null data() pointer, and passing a null
+    // pointer to memcmp() is undefined behavior even for a zero length.
+    // Settle the comparison on the sizes alone as soon as either side is
+    // empty, so neither pointer is touched in that case.
+    if (lhs.isEmpty() || rhs.isEmpty())
+        return qt_lencmp(lhs.size(), rhs.size());
+
+    if (cs == Qt::CaseSensitive) {
+        const auto l = std::min(lhs.size(), rhs.size());
+        int r = memcmp(lhs.data(), rhs.data(), l);
+        return r ? r : qt_lencmp(lhs.size(), rhs.size());
+    }
+
+    char32_t uc1 = QChar::Null;
+    auto src1 = reinterpret_cast<const uchar *>(lhs.data());
+    auto end1 = src1 + lhs.size();
+    char32_t uc2 = QChar::Null;
+    auto src2 = reinterpret_cast<const uchar *>(rhs.data());
+    auto end2 = src2 + rhs.size();
+
+    while (src1 < end1 && src2 < end2) {
+        // decode one code point from lhs ...
+        uchar b = *src1++;
+        char32_t *output = &uc1;
+        qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+        if (res < 0) {
+            // decoding error
+            uc1 = QChar::ReplacementCharacter;
+        }
+
+        // ... and one from rhs
+        b = *src2++;
+        output = &uc2;
+        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src2, end2);
+        if (res < 0) {
+            // decoding error
+            uc2 = QChar::ReplacementCharacter;
+        }
+
+        // only the case-insensitive comparison reaches this loop
+        uc1 = QChar::toCaseFolded(uc1);
+        uc2 = QChar::toCaseFolded(uc2);
+        if (uc1 != uc2)
+            return int(uc1) - int(uc2);
+    }
+
+    // the shorter string sorts first
+    return (end1 > src1) - (end2 > src2);
+}
+
+#ifndef QT_BOOTSTRAPPED
QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
{
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
@@ -871,7 +952,7 @@ QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *s
QByteArray d(length, Qt::Uninitialized);
char *end = convertFromUnicode(d.data(), in, state, endian);
- Q_ASSERT(end - d.constData() == d.length());
+ Q_ASSERT(end - d.constData() == d.size());
Q_UNUSED(end);
return d;
}
@@ -894,13 +975,13 @@ char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::St
out += 2;
}
if (endian == BigEndianness)
- qToBigEndian<char16_t>(in.data(), in.length(), out);
+ qToBigEndian<char16_t>(in.data(), in.size(), out);
else
- qToLittleEndian<char16_t>(in.data(), in.length(), out);
+ qToLittleEndian<char16_t>(in.data(), in.size(), out);
state->remainingChars = 0;
state->internalState |= HeaderDone;
- return out + 2*in.length();
+ return out + 2*in.size();
}
QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
@@ -1030,7 +1111,7 @@ char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::St
}
const QChar *uc = in.data();
- const QChar *end = in.data() + in.length();
+ const QChar *end = in.data() + in.size();
QChar ch;
char32_t ucs4;
if (state->remainingChars == 1) {
@@ -1171,6 +1252,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
return out;
}
+#endif // !QT_BOOTSTRAPPED
#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
int QLocal8Bit::checkUtf8()
@@ -1178,186 +1260,365 @@ int QLocal8Bit::checkUtf8()
return GetACP() == CP_UTF8 ? 1 : -1;
}
-static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
+QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
{
- qsizetype length = in.size();
- const char *chars = in.data();
-
- Q_ASSERT(state);
- if (state->flags & QStringConverter::Flag::Stateless) // temporary
- state = nullptr;
-
- if (!chars || !length)
- return QString();
-
- qsizetype copyLocation = 0;
- qsizetype extra = 2;
- if (state && state->remainingChars) {
- copyLocation = state->remainingChars;
- extra += copyLocation;
- }
- qsizetype newLength = length + extra;
- char *mbcs = new char[newLength];
- //ensure that we have a NULL terminated string
- mbcs[newLength-1] = 0;
- mbcs[newLength-2] = 0;
- memcpy(&(mbcs[copyLocation]), chars, length);
- if (copyLocation) {
- //copy the last character from the state
- mbcs[0] = (char)state->state_data[0];
- state->remainingChars = 0;
- }
- const char *mb = mbcs;
- const char *next = 0;
- QString s;
- while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
- wchar_t wc[2] ={0};
- int charlength = next - mb;
- int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
- if (len>0) {
- s.append(QChar(wc[0]));
- } else {
- int r = GetLastError();
- //check if the character being dropped is the last character
- if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
- state->remainingChars = 1;
- state->state_data[0] = (char)*mb;
- }
- }
- mb = next;
- }
- delete [] mbcs;
- return s;
+ return convertToUnicode_sys(in, CP_ACP, state);
}
-
-QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
+QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
+ QStringConverter::State *state)
{
- qsizetype length = in.size();
-
- Q_ASSERT(length < INT_MAX); // ### FIXME
const char *mb = in.data();
- int mblen = length;
+ qsizetype mblen = in.size();
+
+ Q_ASSERT(state);
+ qsizetype &invalidChars = state->invalidChars;
+ using Flag = QStringConverter::Flag;
+ const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
+ const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
+ : QChar::ReplacementCharacter;
+ if (state->flags & Flag::Stateless) {
+ Q_ASSERT(state->remainingChars == 0);
+ state = nullptr;
+ }
if (!mb || !mblen)
return QString();
- QVarLengthArray<wchar_t, 4096> wc(4096);
- int len;
+ // Use a local stack-buffer at first to allow us a decently large container
+ // to avoid a lot of resizing, without also returning an overallocated
+ // QString to the user for small strings.
+ // Then we can be fast for small strings and take the hit of extra resizes
+ // and measuring how much storage is needed for large strings.
+ std::array<wchar_t, 4096> buf;
+ wchar_t *out = buf.data();
+ qsizetype outlen = buf.size();
+
QString sp;
- bool prepend = false;
- char state_data = 0;
- int remainingChars = 0;
-
- //save the current state information
- if (state) {
- state_data = (char)state->state_data[0];
- remainingChars = state->remainingChars;
- }
- //convert the pending character (if available)
- if (state && remainingChars) {
- char prev[3] = {0};
- prev[0] = state_data;
- prev[1] = mb[0];
- remainingChars = 0;
- len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- prev, 2, wc.data(), wc.length());
- if (len) {
- sp.append(QChar(wc[0]));
- if (mblen == 1) {
- state->remainingChars = 0;
- return sp;
- }
- prepend = true;
- mb++;
- mblen--;
- wc[0] = 0;
+ // Return a pointer to storage where we have enough space for `size`
+ const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
+ if (outlen >= size)
+ return {out, outlen};
+ const bool wasStackBuffer = sp.isEmpty();
+ const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
+ const qsizetype offset = qsizetype(std::distance(begin, out));
+ qsizetype newSize = 0;
+ if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
+ Q_CHECK_PTR(false);
+ return {nullptr, 0};
+ }
+ sp.resize(newSize);
+ auto it = reinterpret_cast<wchar_t *>(sp.data());
+ if (wasStackBuffer)
+ it = std::copy_n(buf.data(), offset, it);
+ else
+ it += offset;
+ return {it, size};
+ };
+
+ // Convert the pending characters (if available)
+ while (state && state->remainingChars && mblen) {
+ QStringConverter::State localState;
+ localState.flags = state->flags;
+ // Use at most 6 characters as a guess for the longest encoded character
+ // in any multibyte encoding.
+ // Even with a total of 2 bytes of overhead that would leave around
+ // 2^(4 * 8) possible characters
+ std::array<char, 6> prev = {0};
+ Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
+ qsizetype index = 0;
+ for (; index < state->remainingChars; ++index)
+ prev[index] = state->state_data[index];
+ const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
+ for (qsizetype i = 0; i < toCopy; ++i, ++index)
+ prev[index] = mb[i];
+ mb += toCopy;
+ mblen -= toCopy;
+
+ // Recursing:
+ // Since we are using a clean local state it will try to decode what was
+ // stored in our state + some extra octets from input (`prev`). If some
+ // part fails we will have those characters stored in the local state's
+ // storage, and we can extract those. It may also output some
+ // replacement characters, which we'll count in the invalidChars.
+ // In the best case we only do this once, but we will loop until we have
+ // resolved all the remaining characters or we have run out of new input
+ // in which case we may still have remaining characters.
+ const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
+ &localState);
+ std::tie(out, outlen) = growOut(tmp.size());
+ if (!out)
+ return {};
+ out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
+ outlen -= tmp.size();
+ const qsizetype tail = toCopy - localState.remainingChars;
+ if (tail >= 0) {
+ // Everything left to process comes from `in`, so we can stop
+ // looping. Adjust the window for `in` and unset remainingChars to
+ // signal that we're done.
+ mb -= localState.remainingChars;
+ mblen += localState.remainingChars;
+ localState.remainingChars = 0;
}
+ state->remainingChars = localState.remainingChars;
+ state->invalidChars += localState.invalidChars;
+ std::copy_n(localState.state_data, state->remainingChars, state->state_data);
}
- while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
- mb, mblen, wc.data(), wc.length()))) {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- mb, mblen, 0, 0);
- wc.resize(wclen);
- } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
- //find the last non NULL character
- while (mblen > 1 && !(mb[mblen-1]))
- mblen--;
- //check whether, we hit an invalid character in the middle
- if ((mblen <= 1) || (remainingChars && state_data))
- return convertToUnicodeCharByChar(in, state);
- //Remove the last character and try again...
- state_data = mb[mblen-1];
- remainingChars = 1;
- mblen--;
+ Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
+
+ // Need it in this scope, since we try to decrease our window size if we
+ // encounter an error
+ int nextIn = qt_saturate<int>(mblen);
+ while (mblen > 0) {
+ std::tie(out, outlen) = growOut(1); // Need space for at least one character
+ if (!out)
+ return {};
+ const int nextOut = qt_saturate<int>(outlen);
+ int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
+ if (len) {
+ mb += nextIn;
+ mblen -= nextIn;
+ out += len;
+ outlen -= len;
} else {
- // Fail.
- qWarning("MultiByteToWideChar: Cannot convert multibyte text");
- break;
+ int r = GetLastError();
+ if (r == ERROR_INSUFFICIENT_BUFFER) {
+ const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
+ std::tie(out, outlen) = growOut(wclen);
+ if (!out)
+ return {};
+ } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
+ // Can't decode the current window, so either store the state,
+ // reduce window size or output a replacement character.
+
+ // Check if we can store all remaining characters in the state
+ // to be used next time we're called:
+ if (state && mblen <= q20::ssize(state->state_data)) {
+ state->remainingChars = mblen;
+ std::copy_n(mb, mblen, state->state_data);
+ mb += mblen;
+ mblen = 0;
+ break;
+ }
+
+ // .. if not, try to find the last valid character in the window
+ // and try again with a shrunken window:
+ if (nextIn > 1) {
+ // There may be some incomplete data at the end of our current
+ // window, so decrease the window size and try again.
+                    // In the worst case scenario there are gigabytes of undecodable
+                    // garbage, but what are we supposed to do about that?
+ const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
+ if (it != mb)
+ nextIn = int(it - mb);
+ else
+ --nextIn;
+ continue;
+ }
+
+ // Finally, we are forced to output a replacement character for
+ // the first byte in the window:
+ std::tie(out, outlen) = growOut(1);
+ if (!out)
+ return {};
+ *out = replacementCharacter;
+ ++invalidChars;
+ ++out;
+ --outlen;
+ ++mb;
+ --mblen;
+ } else {
+ // Fail.
+ qWarning("MultiByteToWideChar: Cannot convert multibyte text");
+ break;
+ }
}
+ nextIn = qt_saturate<int>(mblen);
}
- if (len <= 0)
- return QString();
+ if (sp.isEmpty()) {
+ // We must have only used the stack buffer
+ if (out != buf.data()) // else: we return null-string
+ sp = QStringView(buf.data(), out).toString();
+ } else{
+ const auto begin = reinterpret_cast<wchar_t *>(sp.data());
+ sp.truncate(std::distance(begin, out));
+ }
- if (wc[len-1] == 0) // len - 1: we don't want terminator
- --len;
+ if (sp.size() && sp.back().isNull())
+ sp.chop(1);
- //save the new state information
- if (state) {
- state->state_data[0] = (char)state_data;
- state->remainingChars = remainingChars;
+ if (!state && mblen > 0) {
+ // We have trailing character(s) that could not be converted, and
+ // nowhere to cache them
+ sp.resize(sp.size() + mblen, replacementCharacter);
+ invalidChars += mblen;
}
- QString s((QChar*)wc.data(), len);
- if (prepend) {
- return sp+s;
- }
- return s;
+ return sp;
}
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
{
- const QChar *ch = in.data();
+ return convertFromUnicode_sys(in, CP_ACP, state);
+}
+
+QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
+ QStringConverter::State *state)
+{
+ const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
qsizetype uclen = in.size();
- Q_ASSERT(uclen < INT_MAX); // ### FIXME
Q_ASSERT(state);
- Q_UNUSED(state); // ### Fixme
- if (state->flags & QStringConverter::Flag::Stateless) // temporary
+ // The Windows API has a *boolean* out-parameter that says if a replacement
+ // character was used, but it gives us no way to know _how many_ were used.
+ // Since we cannot simply scan the string for replacement characters
+ // (which is potentially a question mark, and thus a valid character),
+ // we simply do not track the number of invalid characters here.
+ // auto &invalidChars = state->invalidChars;
+
+ using Flag = QStringConverter::Flag;
+ if (state->flags & Flag::Stateless) { // temporary
+ Q_ASSERT(state->remainingChars == 0);
state = nullptr;
+ }
if (!ch)
return QByteArray();
if (uclen == 0)
return QByteArray("");
- BOOL used_def;
- QByteArray mb(4096, 0);
- int len;
- while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
- mb.data(), mb.size()-1, 0, &used_def)))
- {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
- (const wchar_t*)ch, uclen,
- 0, 0, 0, &used_def));
- // and try again...
+
+ // Use a local stack-buffer at first to allow us a decently large container
+ // to avoid a lot of resizing, without also returning an overallocated
+ // QByteArray to the user for small strings.
+ // Then we can be fast for small strings and take the hit of extra resizes
+ // and measuring how much storage is needed for large strings.
+ std::array<char, 4096> buf;
+ char *out = buf.data();
+ qsizetype outlen = buf.size();
+ QByteArray mb;
+
+ if (state && state->remainingChars > 0) {
+ Q_ASSERT(state->remainingChars == 1);
+        // Let's try to encode the pending character
+ wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
+        // Check if the second character is a valid low surrogate,
+        // otherwise we'll just encode the first character, for which Windows
+        // will output a replacement character.
+ const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
+ int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
+ nullptr);
+ if (!len)
+ return {}; // Cannot recover, and I refuse to believe it was a size limitation
+ out += len;
+ outlen -= len;
+ if (validCodePoint) {
+ ++ch;
+ --uclen;
+ }
+ state->remainingChars = 0;
+ state->state_data[0] = 0;
+ if (uclen == 0)
+ return QByteArrayView(buf.data(), len).toByteArray();
+ }
+
+ if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
+ // We can handle a missing low surrogate at the end of the string,
+ // so if there is one, exclude it now and store it in the state.
+ state->remainingChars = 1;
+ state->state_data[0] = ch[uclen - 1];
+ --uclen;
+ if (uclen == 0)
+ return QByteArray();
+ }
+
+ Q_ASSERT(uclen > 0);
+
+ // Return a pointer to storage where we have enough space for `size`
+ const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
+ if (outlen >= size)
+ return {out, outlen};
+ const bool wasStackBuffer = mb.isEmpty();
+ const auto begin = wasStackBuffer ? buf.data() : mb.data();
+ const qsizetype offset = qsizetype(std::distance(begin, out));
+ qsizetype newSize = 0;
+ if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
+ Q_CHECK_PTR(false);
+ return {nullptr, 0};
+ }
+ mb.resize(newSize);
+ auto it = mb.data();
+ if (wasStackBuffer)
+ it = std::copy_n(buf.data(), offset, it);
+ else
+ it += offset;
+ return {it, size};
+ };
+
+ const auto getNextWindowSize = [&]() {
+ int nextIn = qt_saturate<int>(uclen);
+ // The Windows API has some issues if the current window ends in the
+ // middle of a surrogate pair, so we avoid that:
+ if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
+ --nextIn;
+ return nextIn;
+ };
+
+ int len = 0;
+ while (uclen > 0) {
+ const int nextIn = getNextWindowSize();
+ std::tie(out, outlen) = growOut(1); // We need at least one byte
+ if (!out)
+ return {};
+ const int nextOut = qt_saturate<int>(outlen);
+ len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
+ if (len > 0) {
+ ch += nextIn;
+ uclen -= nextIn;
+ out += len;
+ outlen -= len;
} else {
- // Fail. Probably can't happen in fact (dwFlags is 0).
+ int r = GetLastError();
+ if (r == ERROR_INSUFFICIENT_BUFFER) {
+ int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
+ nullptr, nullptr);
+ if (neededLength <= 0) {
+ // Fail. Observed with UTF8 where the input window was max int and ended in an
+                    // incomplete sequence, probably a Windows bug. We try to prevent that
+                    // by reducing the window size in that case. But let's keep this
+ // branch just in case of other bugs.
+#ifndef QT_NO_DEBUG
+ r = GetLastError();
+ fprintf(stderr,
+ "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
+#endif // !QT_NO_DEBUG
+ break;
+ }
+ std::tie(out, outlen) = growOut(neededLength);
+ if (!out)
+ return {};
+ // and try again...
+ } else {
+ // Fail. Probably can't happen in fact (dwFlags is 0).
#ifndef QT_NO_DEBUG
- // Can't use qWarning(), as it'll recurse to handle %ls
- fprintf(stderr,
- "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
- r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
+ // Can't use qWarning(), as it'll recurse to handle %ls
+ fprintf(stderr,
+ "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r,
+ reinterpret_cast<const wchar_t *>(
+ QStringView(ch, uclen).left(100).toString().utf16()));
#endif
- break;
+ break;
+ }
}
}
- mb.resize(len);
+ if (mb.isEmpty()) {
+ // We must have only used the stack buffer
+ if (out != buf.data()) // else: we return null-array
+ mb = QByteArrayView(buf.data(), out).toByteArray();
+ } else {
+ mb.truncate(std::distance(mb.data(), out));
+ }
return mb;
}
#endif
@@ -1373,6 +1634,22 @@ void QStringConverter::State::clear() noexcept
internalState = 0;
}
+// Resets this state. Without the UsesIcu flag this is a plain clear(). With
+// the flag set, d[0] is taken to be the ICU converter and, if non-null, is
+// reset through ucnv_reset(); a build without ICU must never see the flag.
+void QStringConverter::State::reset() noexcept
+{
+    if (!(flags & Flag::UsesIcu)) {
+        clear();
+        return;
+    }
+#if QT_CONFIG(icu)
+    if (UConverter *converter = static_cast<UConverter *>(d[0]))
+        ucnv_reset(converter);
+#else
+    Q_UNREACHABLE();
+#endif
+}
+
+#ifndef QT_BOOTSTRAPPED
static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
{
return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
@@ -1432,20 +1709,9 @@ static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state
{
return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
}
+#endif // !QT_BOOTSTRAPPED
-void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
-
-static QChar *fromLatin1(QChar *out, QByteArrayView in, QStringConverter::State *state)
-{
- Q_ASSERT(state);
- Q_UNUSED(state);
-
- qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
- return out + in.size();
-}
-
-
-static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
+char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
{
Q_ASSERT(state);
if (state->flags & QStringConverter::Flag::Stateless) // temporary
@@ -1453,7 +1719,7 @@ static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
qsizetype invalid = 0;
- for (qsizetype i = 0; i < in.length(); ++i) {
+ for (qsizetype i = 0; i < in.size(); ++i) {
if (in[i] > QChar(0xff)) {
*out = replacement;
++invalid;
@@ -1470,26 +1736,28 @@ static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
{
QString s = QLocal8Bit::convertToUnicode(in, state);
- memcpy(out, s.constData(), s.length()*sizeof(QChar));
- return out + s.length();
+ memcpy(out, s.constData(), s.size()*sizeof(QChar));
+ return out + s.size();
}
static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
{
QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
- memcpy(out, s.constData(), s.length());
- return out + s.length();
+ memcpy(out, s.constData(), s.size());
+ return out + s.size();
}
static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
+#ifndef QT_BOOTSTRAPPED
static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
+#endif
static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
@@ -1594,6 +1862,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
\value Stateless Ignore possible converter states between different function calls
to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
sequence of data is encountered.
+ \omitvalue UsesIcu
*/
/*!
@@ -1624,34 +1893,31 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
{
{ "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
+#ifndef QT_BOOTSTRAPPED
{ "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
{ "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
{ "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
{ "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
{ "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
{ "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
- { "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
+#endif
+ { "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
{ "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
};
// match names case insensitive and skipping '-' and '_'
static bool nameMatch(const char *a, const char *b)
{
- while (*a && *b) {
- if (*a == '-' || *a == '_') {
+ do {
+ while (*a == '-' || *a == '_')
++a;
- continue;
- }
- if (*b == '-' || *b == '_') {
+ while (*b == '-' || *b == '_')
++b;
- continue;
- }
- if (QtMiscUtils::toAsciiLower(*a) != QtMiscUtils::toAsciiLower(*b))
- return false;
- ++a;
- ++b;
- }
- return !*a && !*b;
+ if (!*a && !*b) // end of both strings
+ return true;
+ } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(*b++));
+
+ return false;
}
@@ -1665,15 +1931,263 @@ static bool nameMatch(const char *a, const char *b)
\internal
*/
+
+#if QT_CONFIG(icu)
+// only derives from QStringConverter to get access to protected types
+struct QStringConverterICU : QStringConverter
+{
+ static void clear_function(QStringConverterBase::State *state) noexcept
+ {
+ ucnv_close(static_cast<UConverter *>(state->d[0]));
+ state->d[0] = nullptr;
+ }
+
+ static void ensureConverter(QStringConverter::State *state)
+ {
+ // old code might reset the state via clear instead of reset
+ // in that case, the converter has been closed, and we have to reopen it
+ if (state->d[0] == nullptr)
+ state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
+ }
+
+    /*
+        Decodes the bytes in \a in to UTF-16 using the ICU converter kept in
+        state->d[0] and writes the result starting at \a out.
+
+        Returns a pointer one past the last QChar written. The output limit
+        handed to ICU is toLen(in.size()) code units past \a out, so the
+        destination is assumed to be at least that large.
+    */
+    static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
+    {
+        ensureConverter(state);
+
+        auto icu_conv = static_cast<UConverter *>(state->d[0]);
+        // ensureConverter() stores whatever createConverterForName() returns,
+        // which is nullptr on failure; the ICU calls below need a valid converter.
+        if (!icu_conv)
+            return out;
+
+        UErrorCode err = U_ZERO_ERROR;
+        auto source = in.data();
+        auto sourceLimit = in.data() + in.size();
+
+        qsizetype length = toLen(in.size());
+
+        UChar *target = reinterpret_cast<UChar *>(out);
+        auto targetLimit = target + length;
+        // We explicitly clean up anyway, so no need to set flush to true,
+        // which would just reset the converter.
+        UBool flush = false;
+
+        // If the QStringConverter was moved, the state that we used as a context is stale now.
+        // Register the State object itself: the callbacks cast the context to
+        // State *, so it must not be the address of the local pointer parameter.
+        UConverterToUCallback action;
+        const void *context;
+        ucnv_getToUCallBack(icu_conv, &action, &context);
+        if (context != state)
+            ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
+
+        ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
+        // We did reserve enough space:
+        Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
+        if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
+            // In stateless mode, input still pending inside the converter is
+            // counted as invalid and the converter is reset.
+            if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
+                ucnv_reset(icu_conv);
+                state->invalidChars += leftOver;
+            }
+        }
+        return reinterpret_cast<QChar *>(target);
+    }
+
+    /*
+        Encodes the UTF-16 text in \a in using the ICU converter kept in
+        state->d[0] and writes the resulting bytes starting at \a out.
+
+        Returns a pointer one past the last byte written. The output limit
+        handed to ICU is UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), maxCharSize)
+        bytes past \a out.
+    */
+    static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
+    {
+        ensureConverter(state);
+        auto icu_conv = static_cast<UConverter *>(state->d[0]);
+        // ensureConverter() stores whatever createConverterForName() returns,
+        // which is nullptr on failure; the ICU calls below need a valid converter.
+        if (!icu_conv)
+            return out;
+
+        UErrorCode err = U_ZERO_ERROR;
+        auto source = reinterpret_cast<const UChar *>(in.data());
+        auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
+
+        qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
+
+        char *target = out;
+        char *targetLimit = out + length;
+        UBool flush = false;
+
+        // If the QStringConverter was moved, the state that we used as a context is stale now.
+        // Register the State object itself: the callbacks cast the context to
+        // State *, so it must not be the address of the local pointer parameter.
+        UConverterFromUCallback action;
+        const void *context;
+        ucnv_getFromUCallBack(icu_conv, &action, &context);
+        if (context != state)
+            ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
+
+        ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
+        // We did reserve enough space:
+        Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
+        if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
+            // In stateless mode, input still pending inside the converter is
+            // counted as invalid and the converter is reset.
+            if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
+                ucnv_reset(icu_conv);
+                state->invalidChars += leftOver;
+            }
+        }
+        return target;
+    }
+
+ Q_DISABLE_COPY_MOVE(QStringConverterICU)
+
+ template<qsizetype X>
+ static qsizetype fromLen(qsizetype inLength)
+ {
+ return X * inLength * sizeof(UChar);
+ }
+
+ static qsizetype toLen(qsizetype inLength)
+ {
+
+ /* Assumption: each input char might map to a different codepoint
+ Each codepoint can take up to 4 bytes == 2 QChar
+ We can ignore reserving space for a BOM, as only UTF encodings use one
+ and those are not handled by the ICU converter.
+ */
+ return 2 * inLength;
+ }
+
+ static constexpr QStringConverter::Interface forLength[] = {
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
+ {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
+ };
+
+ static UConverter *createConverterForName(const char *name, const State *state)
+ {
+ Q_ASSERT(name);
+ Q_ASSERT(state);
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter *conv = ucnv_open(name, &status);
+ if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
+ ucnv_close(conv);
+ return nullptr;
+ }
+
+ if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
+ UErrorCode error = U_ZERO_ERROR;
+
+ auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
+ const char *, int32_t length,
+ UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ *err = U_ZERO_ERROR;
+ UChar c = '\0';
+ ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ };
+ ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
+
+ auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
+ const UChar *, int32_t length,
+ UChar32, UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ *err = U_ZERO_ERROR;
+ const UChar replacement[] = { 0 };
+ const UChar *stringBegin = std::begin(replacement);
+ ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ };
+ ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
+ } else {
+ UErrorCode error = U_ZERO_ERROR;
+
+ auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
+ const char *codeUnits,int32_t length,
+ UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ // use existing ICU callback for logic
+ UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
+
+ };
+ ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
+
+ auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
+ const UChar *codeUnits, int32_t length,
+ UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
+ if (reason <= UCNV_IRREGULAR) {
+ // Recover outer scope's state (which isn't const) from context:
+ auto state = const_cast<State *>(static_cast<const State *>(context));
+ state->invalidChars += length;
+ }
+ // use existing ICU callback for logic
+ UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
+ codePoint, reason, err);
+ };
+ ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
+ }
+ return conv;
+ }
+
+ static const QStringConverter::Interface *make_icu_converter(
+ QStringConverterBase::State *state,
+ const char *name)
+ {
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter *conv = createConverterForName(name, state);
+ if (!conv)
+ return nullptr;
+
+ const char *icuName = ucnv_getName(conv, &status);
+ // ucnv_getStandardName returns a name which is owned by the library
+ // we can thus store it in the state without worrying about its lifetime
+ const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
+ if (U_FAILURE(status) || !persistentName) {
+ status = U_ZERO_ERROR;
+ persistentName = ucnv_getStandardName(icuName, "IANA", &status);
+ }
+ state->d[1] = const_cast<char *>(persistentName);
+ state->d[0] = conv;
+ state->flags |= QStringConverterBase::Flag::UsesIcu;
+ qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
+ state->clearFn = QStringConverterICU::clear_function;
+ if (maxCharSize > 8 || maxCharSize < 1) {
+ qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
+ return nullptr;
+ } else {
+ return &forLength[maxCharSize - 1];
+ }
+
+ }
+
+};
+#endif
+
/*!
\internal
*/
-QStringConverter::QStringConverter(const char *name, Flags f) noexcept
+QStringConverter::QStringConverter(const char *name, Flags f)
: iface(nullptr), state(f)
{
auto e = encodingForName(name);
if (e)
- iface = encodingInterfaces + int(e.value());
+ iface = encodingInterfaces + int(*e);
+#if QT_CONFIG(icu)
+ else
+ iface = QStringConverterICU::make_icu_converter(&state, name);
+#endif
+}
+
+
+const char *QStringConverter::name() const noexcept
+{
+ if (!iface)
+ return nullptr;
+ if (state.flags & QStringConverter::Flag::UsesIcu) {
+#if QT_CONFIG(icu)
+ return static_cast<const char*>(state.d[1]);
+#else
+ return nullptr;
+#endif
+ } else {
+ return iface->name;
+ }
}
/*!
@@ -1706,17 +2220,26 @@ QStringConverter::QStringConverter(const char *name, Flags f) noexcept
Returns the canonical name of the encoding this QStringConverter can encode or decode.
Returns a nullptr if the converter is not valid.
+ The returned name is UTF-8 encoded.
\sa isValid()
*/
/*!
- Returns an optional encoding for \a name. The optional is empty if the name could
- not get converted to a valid encoding.
+ Convert \a name to the corresponding \l Encoding member, if there is one.
+
+ If the \a name is not the name of a codec listed in the Encoding enumeration,
+ \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
+ the QStringConverter constructor when Qt is built with ICU, if ICU provides a
+ converter with the given name.
+
+ \a name is expected to be UTF-8 encoded.
*/
std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
{
- for (int i = 0; i < LastEncoding + 1; ++i) {
+ if (!name)
+ return std::nullopt;
+ for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
if (nameMatch(encodingInterfaces[i].name, name))
return QStringConverter::Encoding(i);
}
@@ -1725,6 +2248,7 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(cons
return std::nullopt;
}
+#ifndef QT_BOOTSTRAPPED
/*!
Returns the encoding for the content of \a data if it can be determined.
\a expectedFirstCharacter can be passed as an additional hint to help determine
@@ -1774,20 +2298,8 @@ QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCha
return std::nullopt;
}
-/*!
- Tries to determine the encoding of the HTML in \a data by looking at leading byte
- order marks or a charset specifier in the HTML meta tag. If the optional is empty,
- the encoding specified is not supported by QStringConverter. If no encoding is
- detected, the method returns Utf8.
-*/
-std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
+static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
{
- // determine charset
- auto encoding = encodingForData(data);
- if (encoding)
- // trust the initial BOM
- return encoding;
-
static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
@@ -1796,7 +2308,7 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByt
if (pos != -1) {
pos = charsetSearcher.indexIn(header, pos);
if (pos != -1) {
- pos += int(qstrlen("charset="));
+ pos += qstrlen("charset=");
if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
++pos;
@@ -1814,14 +2326,115 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByt
if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
name = QByteArrayLiteral("UTF-8");
if (!name.isEmpty())
- return encodingForName(name);
+ return name;
}
}
}
}
+ return QByteArray();
+}
+
+/*!
+ Tries to determine the encoding of the HTML in \a data by looking at leading byte
+ order marks or a charset specifier in the HTML meta tag. If the optional is empty,
+ the encoding specified is not supported by QStringConverter. If no encoding is
+ detected, the method returns Utf8.
+
+ \sa QStringDecoder::decoderForHtml()
+*/
+std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
+{
+ // determine charset
+ std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
+ if (encoding)
+ // trust the initial BOM
+ return encoding;
+
+ QByteArray encodingTag = parseHtmlMetaForEncoding(data);
+ if (!encodingTag.isEmpty())
+ return encodingForName(encodingTag);
+
return Utf8;
}
+static qsizetype availableCodecCount()
+{
+#if !QT_CONFIG(icu)
+ return QStringConverter::Encoding::LastEncoding;
+#else
+ /* ICU also provides the names of the codecs Qt implements itself,
+ except for the special Locale one (so add one for it)
+ */
+ return 1 + ucnv_countAvailable();
+#endif
+}
+
+/*!
+ Returns a list of names of supported codecs. The names returned
+ by this function can be passed to QStringEncoder's and
+ QStringDecoder's constructor to create an encoder or decoder for
+ the given codec.
+
+ \note The order of codecs is an internal implementation detail
+ and not guaranteed to be stable.
+ */
+QStringList QStringConverter::availableCodecs()
+{
+ auto availableCodec = [](qsizetype index) -> QString
+ {
+ #if !QT_CONFIG(icu)
+ return QString::fromLatin1(encodingInterfaces[index].name);
+ #else
+ if (index == 0) // "Locale", not provided by icu
+ return QString::fromLatin1(
+ encodingInterfaces[QStringConverter::Encoding::System].name);
+ // this mirrors the setup we do to set a converter's name
+ UErrorCode status = U_ZERO_ERROR;
+ auto icuName = ucnv_getAvailableName(int32_t(index - 1));
+ const char *standardName = ucnv_getStandardName(icuName, "MIME", &status);
+ if (U_FAILURE(status) || !standardName) {
+ status = U_ZERO_ERROR;
+ standardName = ucnv_getStandardName(icuName, "IANA", &status);
+ }
+ if (!standardName)
+ standardName = icuName;
+ return QString::fromLatin1(standardName);
+ #endif
+ };
+
+ qsizetype codecCount = availableCodecCount();
+ QStringList result;
+ result.reserve(codecCount);
+ for (qsizetype i = 0; i < codecCount; ++i)
+ result.push_back(availableCodec(i));
+ return result;
+}
+
+/*!
+ Tries to determine the encoding of the HTML in \a data by looking at leading byte
+ order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
+ matching the encoding. If the returned decoder is not valid,
+ the encoding specified is not supported by QStringConverter. If no encoding is
+ detected, the method returns a decoder for Utf8.
+
+ \sa isValid()
+*/
+QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
+{
+ // determine charset
+ std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
+ if (encoding)
+ // trust the initial BOM
+ return QStringDecoder(encoding.value());
+
+ QByteArray encodingTag = parseHtmlMetaForEncoding(data);
+ if (!encodingTag.isEmpty())
+ return QStringDecoder(encodingTag);
+
+ return QStringDecoder(Utf8);
+}
+#endif // !QT_BOOTSTRAPPED
+
/*!
Returns the canonical name for encoding \a e.
*/
@@ -1889,12 +2502,14 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
*/
/*!
- \fn QByteArray QStringEncoder::encode(const QString &in)
- \fn QByteArray QStringEncoder::encode(QStringView in)
- \fn QByteArray QStringEncoder::operator()(const QString &in)
- \fn QByteArray QStringEncoder::operator()(QStringView in)
+ \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
+ \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
+ \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
+ \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
+
+ Converts \a in and returns a struct that is implicitly convertible to QByteArray.
- Converts \a in and returns the data as a byte array.
+ \snippet code/src_corelib_text_qstringconverter.cpp 5
*/
/*!
@@ -1978,12 +2593,15 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
*/
/*!
- \fn QString QStringDecoder::operator()(const QByteArray &ba)
- \fn QString QStringDecoder::decode(const QByteArray &ba)
- \fn QString QStringDecoder::operator()(QByteArrayView ba)
- \fn QString QStringDecoder::decode(QByteArrayView ba)
+ \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
+ \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
+ \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
+ \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
+
+ Converts \a ba and returns a struct that is implicitly convertible to QString.
+
- Converts \a ba and returns the data as a QString.
+ \snippet code/src_corelib_text_qstringconverter.cpp 4
*/
/*!
@@ -2008,4 +2626,10 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
\sa requiredSpace
*/
+/*!
+ \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
+ \since 6.6
+ \overload
+*/
+
QT_END_NAMESPACE