diff options
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r-- | src/corelib/text/qstringconverter.cpp | 782 |
1 files changed, 562 insertions, 220 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 443dc8fce9..67c75d708e 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -10,6 +10,8 @@ #include "private/qstringiterator_p.h" #include "private/qtools_p.h" #include "qbytearraymatcher.h" +#include "qcontainertools_impl.h" +#include <QtCore/qbytearraylist.h> #if QT_CONFIG(icu) #include <unicode/ucnv.h> @@ -22,15 +24,21 @@ #include <qt_windows.h> #ifndef QT_BOOTSTRAPPED #include <QtCore/qvarlengtharray.h> +#include <QtCore/q20iterator.h> +#include <QtCore/private/qnumeric_p.h> #endif // !QT_BOOTSTRAPPED #endif +#include <array> + #if __has_include(<bit>) && __cplusplus > 201703L #include <bit> #endif QT_BEGIN_NAMESPACE +using namespace QtMiscUtils; + static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>); static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>); static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>); @@ -193,14 +201,14 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, #ifdef __AVX2__ // do 32 characters at a time // (this is similar to simdTestMask in qstring.cpp) - const __m256i mask = _mm256_set1_epi8(0x80); + const __m256i mask = _mm256_set1_epi8(char(0x80)); for ( ; end - src >= 32; src += 32) { __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); if (_mm256_testz_si256(mask, data)) continue; uint n = _mm256_movemask_epi8(data); - Q_ASSUME(n); + Q_ASSERT(n); // find the next probable ASCII character // we don't want to load 32 bytes again in this loop if we know there are non-ASCII @@ -503,8 +511,7 @@ QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) { Q_ASSERT(state); - const QChar *uc = in.data(); - qsizetype len = in.length(); + qsizetype len = in.size(); if (!len) return out; @@ -521,7 +528,7 
@@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta }; uchar *cursor = reinterpret_cast<uchar *>(out); - const char16_t *src = reinterpret_cast<const char16_t *>(uc); + const char16_t *src = in.utf16(); const char16_t *const end = src + len; if (!(state->flags & QStringDecoder::Flag::Stateless)) { @@ -571,6 +578,21 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta return reinterpret_cast<char *>(cursor); } +char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in) +{ + // ### SIMD-optimize: + for (uchar ch : in) { + if (ch < 128) { + *out++ = ch; + } else { + // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row + *out++ = 0b110'0'0000u | (ch >> 6); + *out++ = 0b10'00'0000u | (ch & 0b0011'1111); + } + } + return out; +} + QString QUtf8::convertToUnicode(QByteArrayView in) { // UTF-8 to UTF-16 always needs the exact same number of words or less: @@ -592,14 +614,14 @@ QString QUtf8::convertToUnicode(QByteArrayView in) return result; } -/*! - \since 5.7 +/*! \internal + \since 6.6 \overload Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of - QChar starting at \a buffer. The buffer is expected to be large enough - to hold the result. An upper bound for the size of the buffer is - \c in.size() QChars. + QChar starting at \a dst in the destination buffer. The buffer is expected + to be large enough to hold the result. An upper bound for the size of the + buffer is \c in.size() QChars. If, during decoding, an error occurs, a QChar::ReplacementCharacter is written. @@ -607,11 +629,12 @@ QString QUtf8::convertToUnicode(QByteArrayView in) Returns a pointer to one past the last QChar written. This function never throws. -*/ -QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept + For QChar buffers, instead of casting manually, you can use the static + QUtf8::convertToUnicode(QChar *, QByteArrayView) directly. 
+*/ +char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept { - char16_t *dst = reinterpret_cast<char16_t *>(buffer); const uchar *const start = reinterpret_cast<const uchar *>(in.data()); const uchar *src = start; const uchar *end = src + in.size(); @@ -634,7 +657,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept do { uchar b = *src++; - int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); + const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); if (res < 0) { // decoding error *dst++ = QChar::ReplacementCharacter; @@ -643,7 +666,7 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept } } - return reinterpret_cast<QChar *>(dst); + return dst; } QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state) @@ -664,23 +687,22 @@ QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *stat return result; } -QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state) +char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state) { qsizetype len = in.size(); Q_ASSERT(state); if (!len) - return out; + return dst; char16_t replacement = QChar::ReplacementCharacter; if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) replacement = QChar::Null; - int res; + qsizetype res; uchar ch = 0; - char16_t *dst = reinterpret_cast<char16_t *>(out); const uchar *src = reinterpret_cast<const uchar *>(in.data()); const uchar *end = src + len; @@ -708,7 +730,7 @@ QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:: // copy to our state and return state->remainingChars = remainingCharsCount + newCharsToCopy; memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); - return out; + return dst; } else if (!headerdone) { // eat the UTF-8 BOM if (dst[-1] == 0xfeff) @@ -764,7 +786,7 @@ QChar 
*QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:: state->remainingChars = 0; } - return reinterpret_cast<QChar *>(dst); + return dst; } struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii @@ -794,7 +816,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in) isValidAscii = false; QUtf8NoOutputTraits::NoOutput output; - int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end); + const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end); if (res < 0) { // decoding error return { false, false }; @@ -805,7 +827,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in) return { true, isValidAscii }; } -int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept +int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept { auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data()); auto end1 = src1 + utf8.size(); @@ -821,7 +843,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept if (uc1 >= 0x80) { char32_t *output = &uc1; - int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1); + qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1); if (res < 0) { // decoding error uc1 = QChar::ReplacementCharacter; @@ -832,7 +854,10 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2)) uc2 = QChar::surrogateToUcs4(uc2, *src2++); } - + if (cs == Qt::CaseInsensitive) { + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); + } if (uc1 != uc2) return int(uc1) - int(uc2); } @@ -842,7 +867,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept return (end1 > src1) - int(end2 > src2); } -int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) +int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, 
Qt::CaseSensitivity cs) { char32_t uc1 = QChar::Null; auto src1 = reinterpret_cast<const uchar *>(utf8.data()); @@ -853,13 +878,62 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) while (src1 < end1 && src2 < end2) { uchar b = *src1++; char32_t *output = &uc1; - int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); + const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); if (res < 0) { // decoding error uc1 = QChar::ReplacementCharacter; } char32_t uc2 = *src2++; + if (cs == Qt::CaseInsensitive) { + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); + } + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - (end2 > src2); +} + +int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept +{ + if (lhs.isEmpty()) + return qt_lencmp(0, rhs.size()); + + if (cs == Qt::CaseSensitive) { + const auto l = std::min(lhs.size(), rhs.size()); + int r = memcmp(lhs.data(), rhs.data(), l); + return r ? 
r : qt_lencmp(lhs.size(), rhs.size()); + } + + char32_t uc1 = QChar::Null; + auto src1 = reinterpret_cast<const uchar *>(lhs.data()); + auto end1 = src1 + lhs.size(); + char32_t uc2 = QChar::Null; + auto src2 = reinterpret_cast<const uchar *>(rhs.data()); + auto end2 = src2 + rhs.size(); + + while (src1 < end1 && src2 < end2) { + uchar b = *src1++; + char32_t *output = &uc1; + qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + b = *src2++; + output = &uc2; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src2, end2); + if (res < 0) { + // decoding error + uc2 = QChar::ReplacementCharacter; + } + + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); if (uc1 != uc2) return int(uc1) - int(uc2); } @@ -868,6 +942,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) return (end1 > src1) - (end2 > src2); } +#ifndef QT_BOOTSTRAPPED QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian) { bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom; @@ -877,7 +952,7 @@ QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *s QByteArray d(length, Qt::Uninitialized); char *end = convertFromUnicode(d.data(), in, state, endian); - Q_ASSERT(end - d.constData() == d.length()); + Q_ASSERT(end - d.constData() == d.size()); Q_UNUSED(end); return d; } @@ -900,13 +975,13 @@ char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::St out += 2; } if (endian == BigEndianness) - qToBigEndian<char16_t>(in.data(), in.length(), out); + qToBigEndian<char16_t>(in.data(), in.size(), out); else - qToLittleEndian<char16_t>(in.data(), in.length(), out); + qToLittleEndian<char16_t>(in.data(), in.size(), out); state->remainingChars = 0; state->internalState |= HeaderDone; - return out + 2*in.length(); + 
return out + 2*in.size(); } QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian) @@ -1036,7 +1111,7 @@ char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::St } const QChar *uc = in.data(); - const QChar *end = in.data() + in.length(); + const QChar *end = in.data() + in.size(); QChar ch; char32_t ucs4; if (state->remainingChars == 1) { @@ -1177,6 +1252,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter: return out; } +#endif // !QT_BOOTSTRAPPED #if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED) int QLocal8Bit::checkUtf8() @@ -1184,186 +1260,365 @@ int QLocal8Bit::checkUtf8() return GetACP() == CP_UTF8 ? 1 : -1; } -static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state) +QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) { - qsizetype length = in.size(); - const char *chars = in.data(); - - Q_ASSERT(state); - if (state->flags & QStringConverter::Flag::Stateless) // temporary - state = nullptr; - - if (!chars || !length) - return QString(); - - qsizetype copyLocation = 0; - qsizetype extra = 2; - if (state && state->remainingChars) { - copyLocation = state->remainingChars; - extra += copyLocation; - } - qsizetype newLength = length + extra; - char *mbcs = new char[newLength]; - //ensure that we have a NULL terminated string - mbcs[newLength-1] = 0; - mbcs[newLength-2] = 0; - memcpy(&(mbcs[copyLocation]), chars, length); - if (copyLocation) { - //copy the last character from the state - mbcs[0] = (char)state->state_data[0]; - state->remainingChars = 0; - } - const char *mb = mbcs; - const char *next = 0; - QString s; - while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) { - wchar_t wc[2] ={0}; - int charlength = int(next - mb); // always just a few bytes - int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2); - if (len>0) { - 
s.append(QChar(wc[0])); - } else { - int r = GetLastError(); - //check if the character being dropped is the last character - if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) { - state->remainingChars = 1; - state->state_data[0] = (char)*mb; - } - } - mb = next; - } - delete [] mbcs; - return s; + return convertToUnicode_sys(in, CP_ACP, state); } - -QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) +QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, + QStringConverter::State *state) { - qsizetype length = in.size(); - - Q_ASSERT(length < INT_MAX); // ### FIXME const char *mb = in.data(); - int mblen = length; + qsizetype mblen = in.size(); + + Q_ASSERT(state); + qsizetype &invalidChars = state->invalidChars; + using Flag = QStringConverter::Flag; + const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull); + const char16_t replacementCharacter = useNullForReplacement ? QChar::Null + : QChar::ReplacementCharacter; + if (state->flags & Flag::Stateless) { + Q_ASSERT(state->remainingChars == 0); + state = nullptr; + } if (!mb || !mblen) return QString(); - QVarLengthArray<wchar_t, 4096> wc(4096); - int len; + // Use a local stack-buffer at first to allow us a decently large container + // to avoid a lot of resizing, without also returning an overallocated + // QString to the user for small strings. + // Then we can be fast for small strings and take the hit of extra resizes + // and measuring how much storage is needed for large strings. 
+ std::array<wchar_t, 4096> buf; + wchar_t *out = buf.data(); + qsizetype outlen = buf.size(); + QString sp; - bool prepend = false; - char state_data = 0; - int remainingChars = 0; - - //save the current state information - if (state) { - state_data = (char)state->state_data[0]; - remainingChars = state->remainingChars; - } - //convert the pending character (if available) - if (state && remainingChars) { - char prev[3] = {0}; - prev[0] = state_data; - prev[1] = mb[0]; - remainingChars = 0; - len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, - prev, 2, wc.data(), wc.length()); - if (len) { - sp.append(QChar(wc[0])); - if (mblen == 1) { - state->remainingChars = 0; - return sp; - } - prepend = true; - mb++; - mblen--; - wc[0] = 0; + // Return a pointer to storage where we have enough space for `size` + const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> { + if (outlen >= size) + return {out, outlen}; + const bool wasStackBuffer = sp.isEmpty(); + const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data()); + const qsizetype offset = qsizetype(std::distance(begin, out)); + qsizetype newSize = 0; + if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) { + Q_CHECK_PTR(false); + return {nullptr, 0}; + } + sp.resize(newSize); + auto it = reinterpret_cast<wchar_t *>(sp.data()); + if (wasStackBuffer) + it = std::copy_n(buf.data(), offset, it); + else + it += offset; + return {it, size}; + }; + + // Convert the pending characters (if available) + while (state && state->remainingChars && mblen) { + QStringConverter::State localState; + localState.flags = state->flags; + // Use at most 6 characters as a guess for the longest encoded character + // in any multibyte encoding. 
+ // Even with a total of 2 bytes of overhead that would leave around + // 2^(4 * 8) possible characters + std::array<char, 6> prev = {0}; + Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data)); + qsizetype index = 0; + for (; index < state->remainingChars; ++index) + prev[index] = state->state_data[index]; + const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen); + for (qsizetype i = 0; i < toCopy; ++i, ++index) + prev[index] = mb[i]; + mb += toCopy; + mblen -= toCopy; + + // Recursing: + // Since we are using a clean local state it will try to decode what was + // stored in our state + some extra octets from input (`prev`). If some + // part fails we will have those characters stored in the local state's + // storage, and we can extract those. It may also output some + // replacement characters, which we'll count in the invalidChars. + // In the best case we only do this once, but we will loop until we have + // resolved all the remaining characters or we have run out of new input + // in which case we may still have remaining characters. + const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage, + &localState); + std::tie(out, outlen) = growOut(tmp.size()); + if (!out) + return {}; + out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out); + outlen -= tmp.size(); + const qsizetype tail = toCopy - localState.remainingChars; + if (tail >= 0) { + // Everything left to process comes from `in`, so we can stop + // looping. Adjust the window for `in` and unset remainingChars to + // signal that we're done. 
+ mb -= localState.remainingChars; + mblen += localState.remainingChars; + localState.remainingChars = 0; } + state->remainingChars = localState.remainingChars; + state->invalidChars += localState.invalidChars; + std::copy_n(localState.state_data, state->remainingChars, state->state_data); } - while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, - mb, mblen, wc.data(), wc.length()))) { - int r = GetLastError(); - if (r == ERROR_INSUFFICIENT_BUFFER) { - const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, - mb, mblen, 0, 0); - wc.resize(wclen); - } else if (r == ERROR_NO_UNICODE_TRANSLATION) { - //find the last non NULL character - while (mblen > 1 && !(mb[mblen-1])) - mblen--; - //check whether, we hit an invalid character in the middle - if ((mblen <= 1) || (remainingChars && state_data)) - return convertToUnicodeCharByChar(in, state); - //Remove the last character and try again... - state_data = mb[mblen-1]; - remainingChars = 1; - mblen--; + Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0); + + // Need it in this scope, since we try to decrease our window size if we + // encounter an error + int nextIn = qt_saturate<int>(mblen); + while (mblen > 0) { + std::tie(out, outlen) = growOut(1); // Need space for at least one character + if (!out) + return {}; + const int nextOut = qt_saturate<int>(outlen); + int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut); + if (len) { + mb += nextIn; + mblen -= nextIn; + out += len; + outlen -= len; } else { - // Fail. 
- qWarning("MultiByteToWideChar: Cannot convert multibyte text"); - break; + int r = GetLastError(); + if (r == ERROR_INSUFFICIENT_BUFFER) { + const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0); + std::tie(out, outlen) = growOut(wclen); + if (!out) + return {}; + } else if (r == ERROR_NO_UNICODE_TRANSLATION) { + // Can't decode the current window, so either store the state, + // reduce window size or output a replacement character. + + // Check if we can store all remaining characters in the state + // to be used next time we're called: + if (state && mblen <= q20::ssize(state->state_data)) { + state->remainingChars = mblen; + std::copy_n(mb, mblen, state->state_data); + mb += mblen; + mblen = 0; + break; + } + + // .. if not, try to find the last valid character in the window + // and try again with a shrunken window: + if (nextIn > 1) { + // There may be some incomplete data at the end of our current + // window, so decrease the window size and try again. + // In the worst case scenario there is gigs of undecodable + // garbage, but what are we supposed to do about that? + const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0); + if (it != mb) + nextIn = int(it - mb); + else + --nextIn; + continue; + } + + // Finally, we are forced to output a replacement character for + // the first byte in the window: + std::tie(out, outlen) = growOut(1); + if (!out) + return {}; + *out = replacementCharacter; + ++invalidChars; + ++out; + --outlen; + ++mb; + --mblen; + } else { + // Fail. 
+ qWarning("MultiByteToWideChar: Cannot convert multibyte text"); + break; + } } + nextIn = qt_saturate<int>(mblen); } - if (len <= 0) - return QString(); + if (sp.isEmpty()) { + // We must have only used the stack buffer + if (out != buf.data()) // else: we return null-string + sp = QStringView(buf.data(), out).toString(); + } else{ + const auto begin = reinterpret_cast<wchar_t *>(sp.data()); + sp.truncate(std::distance(begin, out)); + } - if (wc[len-1] == 0) // len - 1: we don't want terminator - --len; + if (sp.size() && sp.back().isNull()) + sp.chop(1); - //save the new state information - if (state) { - state->state_data[0] = (char)state_data; - state->remainingChars = remainingChars; - } - QString s((QChar*)wc.data(), len); - if (prepend) { - return sp+s; + if (!state && mblen > 0) { + // We have trailing character(s) that could not be converted, and + // nowhere to cache them + sp.resize(sp.size() + mblen, replacementCharacter); + invalidChars += mblen; } - return s; + return sp; } QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state) { - const QChar *ch = in.data(); + return convertFromUnicode_sys(in, CP_ACP, state); +} + +QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage, + QStringConverter::State *state) +{ + const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data()); qsizetype uclen = in.size(); - Q_ASSERT(uclen < INT_MAX); // ### FIXME Q_ASSERT(state); - Q_UNUSED(state); // ### Fixme - if (state->flags & QStringConverter::Flag::Stateless) // temporary + // The Windows API has a *boolean* out-parameter that says if a replacement + // character was used, but it gives us no way to know _how many_ were used. + // Since we cannot simply scan the string for replacement characters + // (which is potentially a question mark, and thus a valid character), + // we simply do not track the number of invalid characters here. 
+ // auto &invalidChars = state->invalidChars; + + using Flag = QStringConverter::Flag; + if (state->flags & Flag::Stateless) { // temporary + Q_ASSERT(state->remainingChars == 0); state = nullptr; + } if (!ch) return QByteArray(); if (uclen == 0) return QByteArray(""); - BOOL used_def; - QByteArray mb(4096, 0); - int len; - while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen, - mb.data(), mb.size()-1, 0, &used_def))) - { - int r = GetLastError(); - if (r == ERROR_INSUFFICIENT_BUFFER) { - mb.resize(1+WideCharToMultiByte(CP_ACP, 0, - (const wchar_t*)ch, uclen, - 0, 0, 0, &used_def)); - // and try again... + + // Use a local stack-buffer at first to allow us a decently large container + // to avoid a lot of resizing, without also returning an overallocated + // QByteArray to the user for small strings. + // Then we can be fast for small strings and take the hit of extra resizes + // and measuring how much storage is needed for large strings. + std::array<char, 4096> buf; + char *out = buf.data(); + qsizetype outlen = buf.size(); + QByteArray mb; + + if (state && state->remainingChars > 0) { + Q_ASSERT(state->remainingChars == 1); + // Let's try to decode the pending character + wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] }; + // Check if the second character is a valid low surrogate, + // otherwise we'll just decode the first character, for which windows + // will output a replacement character. + const bool validCodePoint = QChar::isLowSurrogate(wc[1]); + int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 
2 : 1, out, outlen, nullptr, + nullptr); + if (!len) + return {}; // Cannot recover, and I refuse to believe it was a size limitation + out += len; + outlen -= len; + if (validCodePoint) { + ++ch; + --uclen; + } + state->remainingChars = 0; + state->state_data[0] = 0; + if (uclen == 0) + return QByteArrayView(buf.data(), len).toByteArray(); + } + + if (state && QChar::isHighSurrogate(ch[uclen - 1])) { + // We can handle a missing low surrogate at the end of the string, + // so if there is one, exclude it now and store it in the state. + state->remainingChars = 1; + state->state_data[0] = ch[uclen - 1]; + --uclen; + if (uclen == 0) + return QByteArray(); + } + + Q_ASSERT(uclen > 0); + + // Return a pointer to storage where we have enough space for `size` + const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> { + if (outlen >= size) + return {out, outlen}; + const bool wasStackBuffer = mb.isEmpty(); + const auto begin = wasStackBuffer ? buf.data() : mb.data(); + const qsizetype offset = qsizetype(std::distance(begin, out)); + qsizetype newSize = 0; + if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) { + Q_CHECK_PTR(false); + return {nullptr, 0}; + } + mb.resize(newSize); + auto it = mb.data(); + if (wasStackBuffer) + it = std::copy_n(buf.data(), offset, it); + else + it += offset; + return {it, size}; + }; + + const auto getNextWindowSize = [&]() { + int nextIn = qt_saturate<int>(uclen); + // The Windows API has some issues if the current window ends in the + // middle of a surrogate pair, so we avoid that: + if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1])) + --nextIn; + return nextIn; + }; + + int len = 0; + while (uclen > 0) { + const int nextIn = getNextWindowSize(); + std::tie(out, outlen) = growOut(1); // We need at least one byte + if (!out) + return {}; + const int nextOut = qt_saturate<int>(outlen); + len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr); + if (len > 0) { + ch += nextIn; + uclen 
-= nextIn; + out += len; + outlen -= len; } else { - // Fail. Probably can't happen in fact (dwFlags is 0). + int r = GetLastError(); + if (r == ERROR_INSUFFICIENT_BUFFER) { + int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0, + nullptr, nullptr); + if (neededLength <= 0) { + // Fail. Observed with UTF8 where the input window was max int and ended in an + // incomplete sequence, probably a Windows bug. We try to avoid that from + // happening by reducing the window size in that case. But let's keep this + // branch just in case of other bugs. #ifndef QT_NO_DEBUG - // Can't use qWarning(), as it'll recurse to handle %ls - fprintf(stderr, - "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", - r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16())); + r = GetLastError(); + fprintf(stderr, + "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r); +#endif // !QT_NO_DEBUG + break; + } + std::tie(out, outlen) = growOut(neededLength); + if (!out) + return {}; + // and try again... + } else { + // Fail. Probably can't happen in fact (dwFlags is 0). 
+#ifndef QT_NO_DEBUG + // Can't use qWarning(), as it'll recurse to handle %ls + fprintf(stderr, + "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r, + reinterpret_cast<const wchar_t *>( + QStringView(ch, uclen).left(100).toString().utf16())); #endif - break; + break; + } } } - mb.resize(len); + if (mb.isEmpty()) { + // We must have only used the stack buffer + if (out != buf.data()) // else: we return null-array + mb = QByteArrayView(buf.data(), out).toByteArray(); + } else { + mb.truncate(std::distance(mb.data(), out)); + } return mb; } #endif @@ -1394,6 +1649,7 @@ void QStringConverter::State::reset() noexcept } } +#ifndef QT_BOOTSTRAPPED static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state) { return QUtf16::convertToUnicode(out, in, state, DetectEndianness); @@ -1453,20 +1709,9 @@ static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state { return QUtf32::convertFromUnicode(out, in, state, LittleEndianness); } +#endif // !QT_BOOTSTRAPPED -void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept; - -static QChar *fromLatin1(QChar *out, QByteArrayView in, QStringConverter::State *state) -{ - Q_ASSERT(state); - Q_UNUSED(state); - - qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size())); - return out + in.size(); -} - - -static char *toLatin1(char *out, QStringView in, QStringConverter::State *state) +char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept { Q_ASSERT(state); if (state->flags & QStringConverter::Flag::Stateless) // temporary @@ -1474,7 +1719,7 @@ static char *toLatin1(char *out, QStringView in, QStringConverter::State *state) const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 
0 : '?'; qsizetype invalid = 0; - for (qsizetype i = 0; i < in.length(); ++i) { + for (qsizetype i = 0; i < in.size(); ++i) { if (in[i] > QChar(0xff)) { *out = replacement; ++invalid; @@ -1491,26 +1736,28 @@ static char *toLatin1(char *out, QStringView in, QStringConverter::State *state) static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state) { QString s = QLocal8Bit::convertToUnicode(in, state); - memcpy(out, s.constData(), s.length()*sizeof(QChar)); - return out + s.length(); + memcpy(out, s.constData(), s.size()*sizeof(QChar)); + return out + s.size(); } static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state) { QByteArray s = QLocal8Bit::convertFromUnicode(in, state); - memcpy(out, s.constData(), s.length()); - return out + s.length(); + memcpy(out, s.constData(), s.size()); + return out + s.size(); } static qsizetype fromUtf8Len(qsizetype l) { return l + 1; } static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); } +#ifndef QT_BOOTSTRAPPED static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; } static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); } static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; } static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); } +#endif static qsizetype fromLatin1Len(qsizetype l) { return l + 1; } static qsizetype toLatin1Len(qsizetype l) { return l + 1; } @@ -1543,7 +1790,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; } operation, encoding UTF-16 encoded data (usually in the form of a QString) to the requested encoding. - The supported encodings are: + The following encodings are always supported: \list \li UTF-8 @@ -1557,6 +1804,10 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; } \li The system encoding \endlist + QStringConverter may support more encodings depending on how Qt was + compiled. If more codecs are supported, they can be listed using + availableCodecs(). 
+ \l {QStringConverter}s can be used as follows to convert some encoded string to and from UTF-16. @@ -1646,34 +1897,31 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; } const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] = { { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len }, +#ifndef QT_BOOTSTRAPPED { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len }, { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len }, { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len }, { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len }, { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len }, { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len }, - { "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len }, +#endif + { "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len }, { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len } }; // match names case insensitive and skipping '-' and '_' static bool nameMatch(const char *a, const char *b) { - while (*a && *b) { - if (*a == '-' || *a == '_') { + do { + while (*a == '-' || *a == '_') ++a; - continue; - } - if (*b == '-' || *b == '_') { + while (*b == '-' || *b == '_') ++b; - continue; - } - if (QtMiscUtils::toAsciiLower(*a) != QtMiscUtils::toAsciiLower(*b)) - return false; - ++a; - ++b; - } - return !*a && !*b; + if (!*a && !*b) // end of both strings + return true; + } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(*b++)); + + return false; } @@ -1728,7 +1976,7 @@ struct QStringConverterICU : QStringConverter const void *context; ucnv_getToUCallBack(icu_conv, &action, &context); if (context != state) - ucnv_setToUCallBack(icu_conv, action, &state, nullptr, nullptr, &err); + ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err); ucnv_toUnicode(icu_conv, &target, targetLimit, &source, 
sourceLimit, nullptr, flush, &err); // We did reserve enough space: @@ -1750,7 +1998,7 @@ struct QStringConverterICU : QStringConverter auto source = reinterpret_cast<const UChar *>(in.data()); auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size()); - qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.length(), ucnv_getMaxCharSize(icu_conv)); + qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv)); char *target = out; char *targetLimit = out + length; @@ -1761,7 +2009,7 @@ struct QStringConverterICU : QStringConverter const void *context; ucnv_getFromUCallBack(icu_conv, &action, &context); if (context != state) - ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err); + ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err); ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err); // We did reserve enough space: @@ -1976,6 +2224,7 @@ const char *QStringConverter::name() const noexcept Returns the canonical name of the encoding this QStringConverter can encode or decode. Returns a nullptr if the converter is not valid. + The returned name is UTF-8 encoded. \sa isValid() */ @@ -1987,9 +2236,13 @@ const char *QStringConverter::name() const noexcept \c{std::nullopt} is returned. Such a name may, none the less, be accepted by the QStringConverter constructor when Qt is built with ICU, if ICU provides a converter with the given name. + + \a name is expected to be UTF-8 encoded. */ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept { + if (!name) + return std::nullopt; for (qsizetype i = 0; i < LastEncoding + 1; ++i) { if (nameMatch(encodingInterfaces[i].name, name)) return QStringConverter::Encoding(i); @@ -1999,6 +2252,7 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(cons return std::nullopt; } +#ifndef QT_BOOTSTRAPPED /*! 
Returns the encoding for the content of \a data if it can be determined. \a expectedFirstCharacter can be passed as an additional hint to help determine @@ -2107,6 +2361,63 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByt return Utf8; } +static qsizetype availableCodecCount() +{ +#if !QT_CONFIG(icu) + return QStringConverter::Encoding::LastEncoding; +#else + /* icu contains also the names of what Qt provides + except for the special Locale one (so add one for it) + */ + return 1 + ucnv_countAvailable(); +#endif +} + +/*! + Returns a list of names of supported codecs. The names returned + by this function can be passed to QStringEncoder's and + QStringDecoder's constructor to create an en- or decoder for + the given codec. + + This function may be used to obtain a listing of additional codecs beyond + the standard ones. Support for additional codecs requires Qt be compiled + with support for the ICU library. + + \note The order of codecs is an internal implementation detail + and not guaranteed to be stable. 
+ */ +QStringList QStringConverter::availableCodecs() +{ + auto availableCodec = [](qsizetype index) -> QString + { + #if !QT_CONFIG(icu) + return QString::fromLatin1(encodingInterfaces[index].name); + #else + if (index == 0) // "Locale", not provided by icu + return QString::fromLatin1( + encodingInterfaces[QStringConverter::Encoding::System].name); + // this mirrors the setup we do to set a converter's name + UErrorCode status = U_ZERO_ERROR; + auto icuName = ucnv_getAvailableName(int32_t(index - 1)); + const char *standardName = ucnv_getStandardName(icuName, "MIME", &status); + if (U_FAILURE(status) || !standardName) { + status = U_ZERO_ERROR; + standardName = ucnv_getStandardName(icuName, "IANA", &status); + } + if (!standardName) + standardName = icuName; + return QString::fromLatin1(standardName); + #endif + }; + + qsizetype codecCount = availableCodecCount(); + QStringList result; + result.reserve(codecCount); + for (qsizetype i = 0; i < codecCount; ++i) + result.push_back(availableCodec(i)); + return result; +} + /*! Tries to determine the encoding of the HTML in \a data by looking at leading byte order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder @@ -2130,7 +2441,7 @@ QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data) return QStringDecoder(Utf8); } - +#endif // !QT_BOOTSTRAPPED /*! Returns the canonical name for encoding \a e. @@ -2199,12 +2510,24 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) */ /*! - \fn QByteArray QStringEncoder::encode(const QString &in) - \fn QByteArray QStringEncoder::encode(QStringView in) - \fn QByteArray QStringEncoder::operator()(const QString &in) - \fn QByteArray QStringEncoder::operator()(QStringView in) + \fn constexpr QStringEncoder::QStringEncoder(const QString &name, Flags flags = Flag::Default) + \since 6.8 - Converts \a in and returns the data as a byte array. + Creates an encoder object using \a name and \a flags. 
+ If \a name is not the name of a known encoding an invalid converter will get created. + + \sa isValid() +*/ + +/*! + \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in) + \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in) + \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in) + \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in) + + Converts \a in and returns a struct that is implicitly convertible to QByteArray. + + \snippet code/src_corelib_text_qstringconverter.cpp 5 */ /*! @@ -2288,12 +2611,25 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) */ /*! - \fn QString QStringDecoder::operator()(const QByteArray &ba) - \fn QString QStringDecoder::decode(const QByteArray &ba) - \fn QString QStringDecoder::operator()(QByteArrayView ba) - \fn QString QStringDecoder::decode(QByteArrayView ba) + \fn constexpr QStringDecoder::QStringDecoder(const QString &name, Flags flags = Flag::Default) + \since 6.8 + + Creates a decoder object using \a name and \a flags. + If \a name is not the name of a known encoding an invalid converter will get created. - Converts \a ba and returns the data as a QString. + \sa isValid() +*/ + +/*! + \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba) + \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba) + \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba) + \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba) + + Converts \a ba and returns a struct that is implicitly convertible to QString. + + + \snippet code/src_corelib_text_qstringconverter.cpp 4 */ /*! @@ -2318,4 +2654,10 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) \sa requiredSpace */ +/*! 
+ \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in) + \since 6.6 + \overload +*/ + QT_END_NAMESPACE |