diff options
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r-- | src/corelib/text/qstringconverter.cpp | 624 |
1 files changed, 455 insertions, 169 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 90f277ba61..67c75d708e 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -10,6 +10,8 @@ #include "private/qstringiterator_p.h" #include "private/qtools_p.h" #include "qbytearraymatcher.h" +#include "qcontainertools_impl.h" +#include <QtCore/qbytearraylist.h> #if QT_CONFIG(icu) #include <unicode/ucnv.h> @@ -22,9 +24,13 @@ #include <qt_windows.h> #ifndef QT_BOOTSTRAPPED #include <QtCore/qvarlengtharray.h> +#include <QtCore/q20iterator.h> +#include <QtCore/private/qnumeric_p.h> #endif // !QT_BOOTSTRAPPED #endif +#include <array> + #if __has_include(<bit>) && __cplusplus > 201703L #include <bit> #endif @@ -202,7 +208,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, continue; uint n = _mm256_movemask_epi8(data); - Q_ASSUME(n); + Q_ASSERT(n); // find the next probable ASCII character // we don't want to load 32 bytes again in this loop if we know there are non-ASCII @@ -936,6 +942,7 @@ int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivi return (end1 > src1) - (end2 > src2); } +#ifndef QT_BOOTSTRAPPED QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian) { bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom; @@ -1245,6 +1252,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter: return out; } +#endif // !QT_BOOTSTRAPPED #if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED) int QLocal8Bit::checkUtf8() @@ -1252,186 +1260,365 @@ int QLocal8Bit::checkUtf8() return GetACP() == CP_UTF8 ? 
1 : -1; } -static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state) +QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) { - qsizetype length = in.size(); - const char *chars = in.data(); - - Q_ASSERT(state); - if (state->flags & QStringConverter::Flag::Stateless) // temporary - state = nullptr; - - if (!chars || !length) - return QString(); - - qsizetype copyLocation = 0; - qsizetype extra = 2; - if (state && state->remainingChars) { - copyLocation = state->remainingChars; - extra += copyLocation; - } - qsizetype newLength = length + extra; - char *mbcs = new char[newLength]; - //ensure that we have a NULL terminated string - mbcs[newLength-1] = 0; - mbcs[newLength-2] = 0; - memcpy(&(mbcs[copyLocation]), chars, length); - if (copyLocation) { - //copy the last character from the state - mbcs[0] = (char)state->state_data[0]; - state->remainingChars = 0; - } - const char *mb = mbcs; - const char *next = 0; - QString s; - while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) { - wchar_t wc[2] ={0}; - int charlength = int(next - mb); // always just a few bytes - int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2); - if (len>0) { - s.append(QChar(wc[0])); - } else { - int r = GetLastError(); - //check if the character being dropped is the last character - if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) { - state->remainingChars = 1; - state->state_data[0] = (char)*mb; - } - } - mb = next; - } - delete [] mbcs; - return s; + return convertToUnicode_sys(in, CP_ACP, state); } - -QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) +QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, + QStringConverter::State *state) { - qsizetype length = in.size(); - - Q_ASSERT(length < INT_MAX); // ### FIXME const char *mb = in.data(); - int mblen = length; + qsizetype mblen = 
in.size(); + + Q_ASSERT(state); + qsizetype &invalidChars = state->invalidChars; + using Flag = QStringConverter::Flag; + const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull); + const char16_t replacementCharacter = useNullForReplacement ? QChar::Null + : QChar::ReplacementCharacter; + if (state->flags & Flag::Stateless) { + Q_ASSERT(state->remainingChars == 0); + state = nullptr; + } if (!mb || !mblen) return QString(); - QVarLengthArray<wchar_t, 4096> wc(4096); - int len; + // Use a local stack-buffer at first to allow us a decently large container + // to avoid a lot of resizing, without also returning an overallocated + // QString to the user for small strings. + // Then we can be fast for small strings and take the hit of extra resizes + // and measuring how much storage is needed for large strings. + std::array<wchar_t, 4096> buf; + wchar_t *out = buf.data(); + qsizetype outlen = buf.size(); + QString sp; - bool prepend = false; - char state_data = 0; - int remainingChars = 0; - - //save the current state information - if (state) { - state_data = (char)state->state_data[0]; - remainingChars = state->remainingChars; - } - //convert the pending character (if available) - if (state && remainingChars) { - char prev[3] = {0}; - prev[0] = state_data; - prev[1] = mb[0]; - remainingChars = 0; - len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, - prev, 2, wc.data(), wc.length()); - if (len) { - sp.append(QChar(wc[0])); - if (mblen == 1) { - state->remainingChars = 0; - return sp; - } - prepend = true; - mb++; - mblen--; - wc[0] = 0; + // Return a pointer to storage where we have enough space for `size` + const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> { + if (outlen >= size) + return {out, outlen}; + const bool wasStackBuffer = sp.isEmpty(); + const auto begin = wasStackBuffer ? 
buf.data() : reinterpret_cast<wchar_t *>(sp.data()); + const qsizetype offset = qsizetype(std::distance(begin, out)); + qsizetype newSize = 0; + if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) { + Q_CHECK_PTR(false); + return {nullptr, 0}; + } + sp.resize(newSize); + auto it = reinterpret_cast<wchar_t *>(sp.data()); + if (wasStackBuffer) + it = std::copy_n(buf.data(), offset, it); + else + it += offset; + return {it, size}; + }; + + // Convert the pending characters (if available) + while (state && state->remainingChars && mblen) { + QStringConverter::State localState; + localState.flags = state->flags; + // Use at most 6 characters as a guess for the longest encoded character + // in any multibyte encoding. + // Even with a total of 2 bytes of overhead that would leave around + // 2^(4 * 8) possible characters + std::array<char, 6> prev = {0}; + Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data)); + qsizetype index = 0; + for (; index < state->remainingChars; ++index) + prev[index] = state->state_data[index]; + const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen); + for (qsizetype i = 0; i < toCopy; ++i, ++index) + prev[index] = mb[i]; + mb += toCopy; + mblen -= toCopy; + + // Recursing: + // Since we are using a clean local state it will try to decode what was + // stored in our state + some extra octets from input (`prev`). If some + // part fails we will have those characters stored in the local state's + // storage, and we can extract those. It may also output some + // replacement characters, which we'll count in the invalidChars. + // In the best case we only do this once, but we will loop until we have + // resolved all the remaining characters or we have run out of new input + // in which case we may still have remaining characters. 
+ const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage, + &localState); + std::tie(out, outlen) = growOut(tmp.size()); + if (!out) + return {}; + out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out); + outlen -= tmp.size(); + const qsizetype tail = toCopy - localState.remainingChars; + if (tail >= 0) { + // Everything left to process comes from `in`, so we can stop + // looping. Adjust the window for `in` and unset remainingChars to + // signal that we're done. + mb -= localState.remainingChars; + mblen += localState.remainingChars; + localState.remainingChars = 0; } + state->remainingChars = localState.remainingChars; + state->invalidChars += localState.invalidChars; + std::copy_n(localState.state_data, state->remainingChars, state->state_data); } - while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, - mb, mblen, wc.data(), wc.length()))) { - int r = GetLastError(); - if (r == ERROR_INSUFFICIENT_BUFFER) { - const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, - mb, mblen, 0, 0); - wc.resize(wclen); - } else if (r == ERROR_NO_UNICODE_TRANSLATION) { - //find the last non NULL character - while (mblen > 1 && !(mb[mblen-1])) - mblen--; - //check whether, we hit an invalid character in the middle - if ((mblen <= 1) || (remainingChars && state_data)) - return convertToUnicodeCharByChar(in, state); - //Remove the last character and try again... 
- state_data = mb[mblen-1]; - remainingChars = 1; - mblen--; + Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0); + + // Need it in this scope, since we try to decrease our window size if we + // encounter an error + int nextIn = qt_saturate<int>(mblen); + while (mblen > 0) { + std::tie(out, outlen) = growOut(1); // Need space for at least one character + if (!out) + return {}; + const int nextOut = qt_saturate<int>(outlen); + int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut); + if (len) { + mb += nextIn; + mblen -= nextIn; + out += len; + outlen -= len; } else { - // Fail. - qWarning("MultiByteToWideChar: Cannot convert multibyte text"); - break; + int r = GetLastError(); + if (r == ERROR_INSUFFICIENT_BUFFER) { + const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0); + std::tie(out, outlen) = growOut(wclen); + if (!out) + return {}; + } else if (r == ERROR_NO_UNICODE_TRANSLATION) { + // Can't decode the current window, so either store the state, + // reduce window size or output a replacement character. + + // Check if we can store all remaining characters in the state + // to be used next time we're called: + if (state && mblen <= q20::ssize(state->state_data)) { + state->remainingChars = mblen; + std::copy_n(mb, mblen, state->state_data); + mb += mblen; + mblen = 0; + break; + } + + // .. if not, try to find the last valid character in the window + // and try again with a shrunken window: + if (nextIn > 1) { + // There may be some incomplete data at the end of our current + // window, so decrease the window size and try again. + // In the worst case scenario there is gigs of undecodable + // garbage, but what are we supposed to do about that? 
+ const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0); + if (it != mb) + nextIn = int(it - mb); + else + --nextIn; + continue; + } + + // Finally, we are forced to output a replacement character for + // the first byte in the window: + std::tie(out, outlen) = growOut(1); + if (!out) + return {}; + *out = replacementCharacter; + ++invalidChars; + ++out; + --outlen; + ++mb; + --mblen; + } else { + // Fail. + qWarning("MultiByteToWideChar: Cannot convert multibyte text"); + break; + } } + nextIn = qt_saturate<int>(mblen); } - if (len <= 0) - return QString(); + if (sp.isEmpty()) { + // We must have only used the stack buffer + if (out != buf.data()) // else: we return null-string + sp = QStringView(buf.data(), out).toString(); + } else{ + const auto begin = reinterpret_cast<wchar_t *>(sp.data()); + sp.truncate(std::distance(begin, out)); + } - if (wc[len-1] == 0) // len - 1: we don't want terminator - --len; + if (sp.size() && sp.back().isNull()) + sp.chop(1); - //save the new state information - if (state) { - state->state_data[0] = (char)state_data; - state->remainingChars = remainingChars; + if (!state && mblen > 0) { + // We have trailing character(s) that could not be converted, and + // nowhere to cache them + sp.resize(sp.size() + mblen, replacementCharacter); + invalidChars += mblen; } - QString s((QChar*)wc.data(), len); - if (prepend) { - return sp+s; - } - return s; + return sp; } QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state) { - const QChar *ch = in.data(); + return convertFromUnicode_sys(in, CP_ACP, state); +} + +QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage, + QStringConverter::State *state) +{ + const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data()); qsizetype uclen = in.size(); - Q_ASSERT(uclen < INT_MAX); // ### FIXME Q_ASSERT(state); - Q_UNUSED(state); // ### Fixme - if (state->flags & QStringConverter::Flag::Stateless) // temporary + // The Windows API 
has a *boolean* out-parameter that says if a replacement + // character was used, but it gives us no way to know _how many_ were used. + // Since we cannot simply scan the string for replacement characters + // (which is potentially a question mark, and thus a valid character), + // we simply do not track the number of invalid characters here. + // auto &invalidChars = state->invalidChars; + + using Flag = QStringConverter::Flag; + if (state->flags & Flag::Stateless) { // temporary + Q_ASSERT(state->remainingChars == 0); state = nullptr; + } if (!ch) return QByteArray(); if (uclen == 0) return QByteArray(""); - BOOL used_def; - QByteArray mb(4096, 0); - int len; - while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen, - mb.data(), mb.size()-1, 0, &used_def))) - { - int r = GetLastError(); - if (r == ERROR_INSUFFICIENT_BUFFER) { - mb.resize(1+WideCharToMultiByte(CP_ACP, 0, - (const wchar_t*)ch, uclen, - 0, 0, 0, &used_def)); - // and try again... + + // Use a local stack-buffer at first to allow us a decently large container + // to avoid a lot of resizing, without also returning an overallocated + // QByteArray to the user for small strings. + // Then we can be fast for small strings and take the hit of extra resizes + // and measuring how much storage is needed for large strings. + std::array<char, 4096> buf; + char *out = buf.data(); + qsizetype outlen = buf.size(); + QByteArray mb; + + if (state && state->remainingChars > 0) { + Q_ASSERT(state->remainingChars == 1); + // Let's try to decode the pending character + wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] }; + // Check if the second character is a valid low surrogate, + // otherwise we'll just decode the first character, for which windows + // will output a replacement character. + const bool validCodePoint = QChar::isLowSurrogate(wc[1]); + int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 
2 : 1, out, outlen, nullptr, + nullptr); + if (!len) + return {}; // Cannot recover, and I refuse to believe it was a size limitation + out += len; + outlen -= len; + if (validCodePoint) { + ++ch; + --uclen; + } + state->remainingChars = 0; + state->state_data[0] = 0; + if (uclen == 0) + return QByteArrayView(buf.data(), len).toByteArray(); + } + + if (state && QChar::isHighSurrogate(ch[uclen - 1])) { + // We can handle a missing low surrogate at the end of the string, + // so if there is one, exclude it now and store it in the state. + state->remainingChars = 1; + state->state_data[0] = ch[uclen - 1]; + --uclen; + if (uclen == 0) + return QByteArray(); + } + + Q_ASSERT(uclen > 0); + + // Return a pointer to storage where we have enough space for `size` + const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> { + if (outlen >= size) + return {out, outlen}; + const bool wasStackBuffer = mb.isEmpty(); + const auto begin = wasStackBuffer ? buf.data() : mb.data(); + const qsizetype offset = qsizetype(std::distance(begin, out)); + qsizetype newSize = 0; + if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) { + Q_CHECK_PTR(false); + return {nullptr, 0}; + } + mb.resize(newSize); + auto it = mb.data(); + if (wasStackBuffer) + it = std::copy_n(buf.data(), offset, it); + else + it += offset; + return {it, size}; + }; + + const auto getNextWindowSize = [&]() { + int nextIn = qt_saturate<int>(uclen); + // The Windows API has some issues if the current window ends in the + // middle of a surrogate pair, so we avoid that: + if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1])) + --nextIn; + return nextIn; + }; + + int len = 0; + while (uclen > 0) { + const int nextIn = getNextWindowSize(); + std::tie(out, outlen) = growOut(1); // We need at least one byte + if (!out) + return {}; + const int nextOut = qt_saturate<int>(outlen); + len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr); + if (len > 0) { + ch += nextIn; + uclen 
-= nextIn; + out += len; + outlen -= len; } else { - // Fail. Probably can't happen in fact (dwFlags is 0). + int r = GetLastError(); + if (r == ERROR_INSUFFICIENT_BUFFER) { + int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0, + nullptr, nullptr); + if (neededLength <= 0) { + // Fail. Observed with UTF8 where the input window was max int and ended in an + // incomplete sequence, probably a Windows bug. We try to avoid that from + // happening by reducing the window size in that case. But let's keep this + // branch just in case of other bugs. +#ifndef QT_NO_DEBUG + r = GetLastError(); + fprintf(stderr, + "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r); +#endif // !QT_NO_DEBUG + break; + } + std::tie(out, outlen) = growOut(neededLength); + if (!out) + return {}; + // and try again... + } else { + // Fail. Probably can't happen in fact (dwFlags is 0). #ifndef QT_NO_DEBUG - // Can't use qWarning(), as it'll recurse to handle %ls - fprintf(stderr, - "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", - r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16())); + // Can't use qWarning(), as it'll recurse to handle %ls + fprintf(stderr, + "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r, + reinterpret_cast<const wchar_t *>( + QStringView(ch, uclen).left(100).toString().utf16())); #endif - break; + break; + } } } - mb.resize(len); + if (mb.isEmpty()) { + // We must have only used the stack buffer + if (out != buf.data()) // else: we return null-array + mb = QByteArrayView(buf.data(), out).toByteArray(); + } else { + mb.truncate(std::distance(mb.data(), out)); + } return mb; } #endif @@ -1462,6 +1649,7 @@ void QStringConverter::State::reset() noexcept } } +#ifndef QT_BOOTSTRAPPED static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state) { return QUtf16::convertToUnicode(out, in, state, DetectEndianness); @@ -1521,6 +1709,7 @@ static char 
*toUtf32LE(char *out, QStringView in, QStringConverter::State *state { return QUtf32::convertFromUnicode(out, in, state, LittleEndianness); } +#endif // !QT_BOOTSTRAPPED char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept { @@ -1562,11 +1751,13 @@ static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *sta static qsizetype fromUtf8Len(qsizetype l) { return l + 1; } static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); } +#ifndef QT_BOOTSTRAPPED static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; } static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); } static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; } static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); } +#endif static qsizetype fromLatin1Len(qsizetype l) { return l + 1; } static qsizetype toLatin1Len(qsizetype l) { return l + 1; } @@ -1599,7 +1790,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; } operation, encoding UTF-16 encoded data (usually in the form of a QString) to the requested encoding. - The supported encodings are: + The following encodings are always supported: \list \li UTF-8 @@ -1613,6 +1804,10 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; } \li The system encoding \endlist + QStringConverter may support more encodings depending on how Qt was + compiled. If more codecs are supported, they can be listed using + availableCodecs(). + \l {QStringConverter}s can be used as follows to convert some encoded string to and from UTF-16. 
@@ -1702,12 +1897,14 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; } const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] = { { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len }, +#ifndef QT_BOOTSTRAPPED { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len }, { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len }, { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len }, { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len }, { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len }, { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len }, +#endif { "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len }, { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len } }; @@ -1715,21 +1912,16 @@ const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringCo // match names case insensitive and skipping '-' and '_' static bool nameMatch(const char *a, const char *b) { - while (*a && *b) { - if (*a == '-' || *a == '_') { + do { + while (*a == '-' || *a == '_') ++a; - continue; - } - if (*b == '-' || *b == '_') { + while (*b == '-' || *b == '_') ++b; - continue; - } - if (QtMiscUtils::toAsciiLower(*a) != QtMiscUtils::toAsciiLower(*b)) - return false; - ++a; - ++b; - } - return !*a && !*b; + if (!*a && !*b) // end of both strings + return true; + } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(*b++)); + + return false; } @@ -1784,7 +1976,7 @@ struct QStringConverterICU : QStringConverter const void *context; ucnv_getToUCallBack(icu_conv, &action, &context); if (context != state) - ucnv_setToUCallBack(icu_conv, action, &state, nullptr, nullptr, &err); + ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err); ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err); // We did reserve enough space: 
@@ -1817,7 +2009,7 @@ struct QStringConverterICU : QStringConverter const void *context; ucnv_getFromUCallBack(icu_conv, &action, &context); if (context != state) - ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err); + ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err); ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err); // We did reserve enough space: @@ -2032,6 +2224,7 @@ const char *QStringConverter::name() const noexcept Returns the canonical name of the encoding this QStringConverter can encode or decode. Returns a nullptr if the converter is not valid. + The returned name is UTF-8 encoded. \sa isValid() */ @@ -2043,9 +2236,13 @@ const char *QStringConverter::name() const noexcept \c{std::nullopt} is returned. Such a name may, none the less, be accepted by the QStringConverter constructor when Qt is built with ICU, if ICU provides a converter with the given name. + + \a name is expected to be UTF-8 encoded. */ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept { + if (!name) + return std::nullopt; for (qsizetype i = 0; i < LastEncoding + 1; ++i) { if (nameMatch(encodingInterfaces[i].name, name)) return QStringConverter::Encoding(i); @@ -2055,6 +2252,7 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(cons return std::nullopt; } +#ifndef QT_BOOTSTRAPPED /*! Returns the encoding for the content of \a data if it can be determined. 
\a expectedFirstCharacter can be passed as an additional hint to help determine @@ -2163,6 +2361,63 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByt return Utf8; } +static qsizetype availableCodecCount() +{ +#if !QT_CONFIG(icu) + return QStringConverter::Encoding::LastEncoding; +#else + /* icu contains also the names of what Qt provides + except for the special Locale one (so add one for it) + */ + return 1 + ucnv_countAvailable(); +#endif +} + +/*! + Returns a list of names of supported codecs. The names returned + by this function can be passed to QStringEncoder's and + QStringDecoder's constructor to create an en- or decoder for + the given codec. + + This function may be used to obtain a listing of additional codecs beyond + the standard ones. Support for additional codecs requires Qt be compiled + with support for the ICU library. + + \note The order of codecs is an internal implementation detail + and not guaranteed to be stable. + */ +QStringList QStringConverter::availableCodecs() +{ + auto availableCodec = [](qsizetype index) -> QString + { + #if !QT_CONFIG(icu) + return QString::fromLatin1(encodingInterfaces[index].name); + #else + if (index == 0) // "Locale", not provided by icu + return QString::fromLatin1( + encodingInterfaces[QStringConverter::Encoding::System].name); + // this mirrors the setup we do to set a converters name + UErrorCode status = U_ZERO_ERROR; + auto icuName = ucnv_getAvailableName(int32_t(index - 1)); + const char *standardName = ucnv_getStandardName(icuName, "MIME", &status); + if (U_FAILURE(status) || !standardName) { + status = U_ZERO_ERROR; + standardName = ucnv_getStandardName(icuName, "IANA", &status); + } + if (!standardName) + standardName = icuName; + return QString::fromLatin1(standardName); + #endif + }; + + qsizetype codecCount = availableCodecCount(); + QStringList result; + result.reserve(codecCount); + for (qsizetype i = 0; i < codecCount; ++i) + result.push_back(availableCodec(i)); +
return result; +} + /*! Tries to determine the encoding of the HTML in \a data by looking at leading byte order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder @@ -2186,7 +2441,7 @@ QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data) return QStringDecoder(Utf8); } - +#endif // !QT_BOOTSTRAPPED /*! Returns the canonical name for encoding \a e. @@ -2255,12 +2510,24 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) */ /*! - \fn QByteArray QStringEncoder::encode(const QString &in) - \fn QByteArray QStringEncoder::encode(QStringView in) - \fn QByteArray QStringEncoder::operator()(const QString &in) - \fn QByteArray QStringEncoder::operator()(QStringView in) + \fn constexpr QStringEncoder::QStringEncoder(const QString &name, Flags flags = Flag::Default) + \since 6.8 + + Creates an encoder object using \a name and \a flags. + If \a name is not the name of a known encoding an invalid converter will get created. - Converts \a in and returns the data as a byte array. + \sa isValid() +*/ + +/*! + \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in) + \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in) + \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in) + \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in) + + Converts \a in and returns a struct that is implicitly convertible to QByteArray. + + \snippet code/src_corelib_text_qstringconverter.cpp 5 */ /*! @@ -2344,12 +2611,25 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) */ /*! 
- \fn QString QStringDecoder::operator()(const QByteArray &ba) - \fn QString QStringDecoder::decode(const QByteArray &ba) - \fn QString QStringDecoder::operator()(QByteArrayView ba) - \fn QString QStringDecoder::decode(QByteArrayView ba) + \fn constexpr QStringDecoder::QStringDecoder(const QString &name, Flags flags = Flag::Default) + \since 6.8 + + Creates a decoder object using \a name and \a flags. + If \a name is not the name of a known encoding an invalid converter will get created. - Converts \a ba and returns the data as a QString. + \sa isValid() +*/ + +/*! + \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba) + \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba) + \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba) + \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba) + + Converts \a ba and returns a struct that is implicitly convertible to QString. + + + \snippet code/src_corelib_text_qstringconverter.cpp 4 */ /*! @@ -2374,4 +2654,10 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) \sa requiredSpace */ +/*! + \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in) + \since 6.6 + \overload +*/ + QT_END_NAMESPACE |