summaryrefslogtreecommitdiffstats
path: root/src/corelib/text/qstringconverter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r--src/corelib/text/qstringconverter.cpp624
1 files changed, 455 insertions, 169 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 90f277ba61..67c75d708e 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -10,6 +10,8 @@
#include "private/qstringiterator_p.h"
#include "private/qtools_p.h"
#include "qbytearraymatcher.h"
+#include "qcontainertools_impl.h"
+#include <QtCore/qbytearraylist.h>
#if QT_CONFIG(icu)
#include <unicode/ucnv.h>
@@ -22,9 +24,13 @@
#include <qt_windows.h>
#ifndef QT_BOOTSTRAPPED
#include <QtCore/qvarlengtharray.h>
+#include <QtCore/q20iterator.h>
+#include <QtCore/private/qnumeric_p.h>
#endif // !QT_BOOTSTRAPPED
#endif
+#include <array>
+
#if __has_include(<bit>) && __cplusplus > 201703L
#include <bit>
#endif
@@ -202,7 +208,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
continue;
uint n = _mm256_movemask_epi8(data);
- Q_ASSUME(n);
+ Q_ASSERT(n);
// find the next probable ASCII character
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
@@ -936,6 +942,7 @@ int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivi
return (end1 > src1) - (end2 > src2);
}
+#ifndef QT_BOOTSTRAPPED
QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
{
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
@@ -1245,6 +1252,7 @@ QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter:
return out;
}
+#endif // !QT_BOOTSTRAPPED
#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
int QLocal8Bit::checkUtf8()
@@ -1252,186 +1260,365 @@ int QLocal8Bit::checkUtf8()
return GetACP() == CP_UTF8 ? 1 : -1;
}
-static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
+QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
{
- qsizetype length = in.size();
- const char *chars = in.data();
-
- Q_ASSERT(state);
- if (state->flags & QStringConverter::Flag::Stateless) // temporary
- state = nullptr;
-
- if (!chars || !length)
- return QString();
-
- qsizetype copyLocation = 0;
- qsizetype extra = 2;
- if (state && state->remainingChars) {
- copyLocation = state->remainingChars;
- extra += copyLocation;
- }
- qsizetype newLength = length + extra;
- char *mbcs = new char[newLength];
- //ensure that we have a NULL terminated string
- mbcs[newLength-1] = 0;
- mbcs[newLength-2] = 0;
- memcpy(&(mbcs[copyLocation]), chars, length);
- if (copyLocation) {
- //copy the last character from the state
- mbcs[0] = (char)state->state_data[0];
- state->remainingChars = 0;
- }
- const char *mb = mbcs;
- const char *next = 0;
- QString s;
- while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
- wchar_t wc[2] ={0};
- int charlength = int(next - mb); // always just a few bytes
- int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
- if (len>0) {
- s.append(QChar(wc[0]));
- } else {
- int r = GetLastError();
- //check if the character being dropped is the last character
- if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
- state->remainingChars = 1;
- state->state_data[0] = (char)*mb;
- }
- }
- mb = next;
- }
- delete [] mbcs;
- return s;
+ return convertToUnicode_sys(in, CP_ACP, state);
}
-
-QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
+QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
+ QStringConverter::State *state)
{
- qsizetype length = in.size();
-
- Q_ASSERT(length < INT_MAX); // ### FIXME
const char *mb = in.data();
- int mblen = length;
+ qsizetype mblen = in.size();
+
+ Q_ASSERT(state);
+ qsizetype &invalidChars = state->invalidChars;
+ using Flag = QStringConverter::Flag;
+ const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
+ const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
+ : QChar::ReplacementCharacter;
+ if (state->flags & Flag::Stateless) {
+ Q_ASSERT(state->remainingChars == 0);
+ state = nullptr;
+ }
if (!mb || !mblen)
return QString();
- QVarLengthArray<wchar_t, 4096> wc(4096);
- int len;
+ // Use a local stack-buffer at first to allow us a decently large container
+ // to avoid a lot of resizing, without also returning an overallocated
+ // QString to the user for small strings.
+ // Then we can be fast for small strings and take the hit of extra resizes
+ // and measuring how much storage is needed for large strings.
+ std::array<wchar_t, 4096> buf;
+ wchar_t *out = buf.data();
+ qsizetype outlen = buf.size();
+
QString sp;
- bool prepend = false;
- char state_data = 0;
- int remainingChars = 0;
-
- //save the current state information
- if (state) {
- state_data = (char)state->state_data[0];
- remainingChars = state->remainingChars;
- }
- //convert the pending character (if available)
- if (state && remainingChars) {
- char prev[3] = {0};
- prev[0] = state_data;
- prev[1] = mb[0];
- remainingChars = 0;
- len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- prev, 2, wc.data(), wc.length());
- if (len) {
- sp.append(QChar(wc[0]));
- if (mblen == 1) {
- state->remainingChars = 0;
- return sp;
- }
- prepend = true;
- mb++;
- mblen--;
- wc[0] = 0;
+ // Return a pointer to storage where we have enough space for `size`
+ const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
+ if (outlen >= size)
+ return {out, outlen};
+ const bool wasStackBuffer = sp.isEmpty();
+ const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
+ const qsizetype offset = qsizetype(std::distance(begin, out));
+ qsizetype newSize = 0;
+ if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
+ Q_CHECK_PTR(false);
+ return {nullptr, 0};
+ }
+ sp.resize(newSize);
+ auto it = reinterpret_cast<wchar_t *>(sp.data());
+ if (wasStackBuffer)
+ it = std::copy_n(buf.data(), offset, it);
+ else
+ it += offset;
+ return {it, size};
+ };
+
+ // Convert the pending characters (if available)
+ while (state && state->remainingChars && mblen) {
+ QStringConverter::State localState;
+ localState.flags = state->flags;
+ // Use at most 6 characters as a guess for the longest encoded character
+ // in any multibyte encoding.
+ // Even with a total of 2 bytes of overhead that would leave around
+ // 2^(4 * 8) possible characters
+ std::array<char, 6> prev = {0};
+ Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
+ qsizetype index = 0;
+ for (; index < state->remainingChars; ++index)
+ prev[index] = state->state_data[index];
+ const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
+ for (qsizetype i = 0; i < toCopy; ++i, ++index)
+ prev[index] = mb[i];
+ mb += toCopy;
+ mblen -= toCopy;
+
+ // Recursing:
+ // Since we are using a clean local state it will try to decode what was
+ // stored in our state + some extra octets from input (`prev`). If some
+ // part fails we will have those characters stored in the local state's
+ // storage, and we can extract those. It may also output some
+ // replacement characters, which we'll count in the invalidChars.
+ // In the best case we only do this once, but we will loop until we have
+ // resolved all the remaining characters or we have run out of new input
+ // in which case we may still have remaining characters.
+ const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
+ &localState);
+ std::tie(out, outlen) = growOut(tmp.size());
+ if (!out)
+ return {};
+ out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
+ outlen -= tmp.size();
+ const qsizetype tail = toCopy - localState.remainingChars;
+ if (tail >= 0) {
+ // Everything left to process comes from `in`, so we can stop
+ // looping. Adjust the window for `in` and unset remainingChars to
+ // signal that we're done.
+ mb -= localState.remainingChars;
+ mblen += localState.remainingChars;
+ localState.remainingChars = 0;
}
+ state->remainingChars = localState.remainingChars;
+ state->invalidChars += localState.invalidChars;
+ std::copy_n(localState.state_data, state->remainingChars, state->state_data);
}
- while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
- mb, mblen, wc.data(), wc.length()))) {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- mb, mblen, 0, 0);
- wc.resize(wclen);
- } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
- //find the last non NULL character
- while (mblen > 1 && !(mb[mblen-1]))
- mblen--;
- //check whether, we hit an invalid character in the middle
- if ((mblen <= 1) || (remainingChars && state_data))
- return convertToUnicodeCharByChar(in, state);
- //Remove the last character and try again...
- state_data = mb[mblen-1];
- remainingChars = 1;
- mblen--;
+ Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
+
+ // Need it in this scope, since we try to decrease our window size if we
+ // encounter an error
+ int nextIn = qt_saturate<int>(mblen);
+ while (mblen > 0) {
+ std::tie(out, outlen) = growOut(1); // Need space for at least one character
+ if (!out)
+ return {};
+ const int nextOut = qt_saturate<int>(outlen);
+ int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
+ if (len) {
+ mb += nextIn;
+ mblen -= nextIn;
+ out += len;
+ outlen -= len;
} else {
- // Fail.
- qWarning("MultiByteToWideChar: Cannot convert multibyte text");
- break;
+ int r = GetLastError();
+ if (r == ERROR_INSUFFICIENT_BUFFER) {
+ const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
+ std::tie(out, outlen) = growOut(wclen);
+ if (!out)
+ return {};
+ } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
+ // Can't decode the current window, so either store the state,
+ // reduce window size or output a replacement character.
+
+ // Check if we can store all remaining characters in the state
+ // to be used next time we're called:
+ if (state && mblen <= q20::ssize(state->state_data)) {
+ state->remainingChars = mblen;
+ std::copy_n(mb, mblen, state->state_data);
+ mb += mblen;
+ mblen = 0;
+ break;
+ }
+
+ // .. if not, try to find the last valid character in the window
+ // and try again with a shrunken window:
+ if (nextIn > 1) {
+ // There may be some incomplete data at the end of our current
+ // window, so decrease the window size and try again.
+ // In the worst case scenario there is gigs of undecodable
+ // garbage, but what are we supposed to do about that?
+ const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
+ if (it != mb)
+ nextIn = int(it - mb);
+ else
+ --nextIn;
+ continue;
+ }
+
+ // Finally, we are forced to output a replacement character for
+ // the first byte in the window:
+ std::tie(out, outlen) = growOut(1);
+ if (!out)
+ return {};
+ *out = replacementCharacter;
+ ++invalidChars;
+ ++out;
+ --outlen;
+ ++mb;
+ --mblen;
+ } else {
+ // Fail.
+ qWarning("MultiByteToWideChar: Cannot convert multibyte text");
+ break;
+ }
}
+ nextIn = qt_saturate<int>(mblen);
}
- if (len <= 0)
- return QString();
+ if (sp.isEmpty()) {
+ // We must have only used the stack buffer
+ if (out != buf.data()) // else: we return null-string
+ sp = QStringView(buf.data(), out).toString();
+ } else{
+ const auto begin = reinterpret_cast<wchar_t *>(sp.data());
+ sp.truncate(std::distance(begin, out));
+ }
- if (wc[len-1] == 0) // len - 1: we don't want terminator
- --len;
+ if (sp.size() && sp.back().isNull())
+ sp.chop(1);
- //save the new state information
- if (state) {
- state->state_data[0] = (char)state_data;
- state->remainingChars = remainingChars;
+ if (!state && mblen > 0) {
+ // We have trailing character(s) that could not be converted, and
+ // nowhere to cache them
+ sp.resize(sp.size() + mblen, replacementCharacter);
+ invalidChars += mblen;
}
- QString s((QChar*)wc.data(), len);
- if (prepend) {
- return sp+s;
- }
- return s;
+ return sp;
}
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
{
- const QChar *ch = in.data();
+ return convertFromUnicode_sys(in, CP_ACP, state);
+}
+
+QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
+ QStringConverter::State *state)
+{
+ const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
qsizetype uclen = in.size();
- Q_ASSERT(uclen < INT_MAX); // ### FIXME
Q_ASSERT(state);
- Q_UNUSED(state); // ### Fixme
- if (state->flags & QStringConverter::Flag::Stateless) // temporary
+ // The Windows API has a *boolean* out-parameter that says if a replacement
+ // character was used, but it gives us no way to know _how many_ were used.
+ // Since we cannot simply scan the string for replacement characters
+ // (which is potentially a question mark, and thus a valid character),
+ // we simply do not track the number of invalid characters here.
+ // auto &invalidChars = state->invalidChars;
+
+ using Flag = QStringConverter::Flag;
+ if (state->flags & Flag::Stateless) { // temporary
+ Q_ASSERT(state->remainingChars == 0);
state = nullptr;
+ }
if (!ch)
return QByteArray();
if (uclen == 0)
return QByteArray("");
- BOOL used_def;
- QByteArray mb(4096, 0);
- int len;
- while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
- mb.data(), mb.size()-1, 0, &used_def)))
- {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
- (const wchar_t*)ch, uclen,
- 0, 0, 0, &used_def));
- // and try again...
+
+ // Use a local stack-buffer at first to allow us a decently large container
+ // to avoid a lot of resizing, without also returning an overallocated
+ // QByteArray to the user for small strings.
+ // Then we can be fast for small strings and take the hit of extra resizes
+ // and measuring how much storage is needed for large strings.
+ std::array<char, 4096> buf;
+ char *out = buf.data();
+ qsizetype outlen = buf.size();
+ QByteArray mb;
+
+ if (state && state->remainingChars > 0) {
+ Q_ASSERT(state->remainingChars == 1);
+ // Let's try to decode the pending character
+ wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
+ // Check if the second character is a valid low surrogate,
+ // otherwise we'll just decode the first character, for which windows
+ // will output a replacement character.
+ const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
+ int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
+ nullptr);
+ if (!len)
+ return {}; // Cannot recover, and I refuse to believe it was a size limitation
+ out += len;
+ outlen -= len;
+ if (validCodePoint) {
+ ++ch;
+ --uclen;
+ }
+ state->remainingChars = 0;
+ state->state_data[0] = 0;
+ if (uclen == 0)
+ return QByteArrayView(buf.data(), len).toByteArray();
+ }
+
+ if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
+ // We can handle a missing low surrogate at the end of the string,
+ // so if there is one, exclude it now and store it in the state.
+ state->remainingChars = 1;
+ state->state_data[0] = ch[uclen - 1];
+ --uclen;
+ if (uclen == 0)
+ return QByteArray();
+ }
+
+ Q_ASSERT(uclen > 0);
+
+ // Return a pointer to storage where we have enough space for `size`
+ const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
+ if (outlen >= size)
+ return {out, outlen};
+ const bool wasStackBuffer = mb.isEmpty();
+ const auto begin = wasStackBuffer ? buf.data() : mb.data();
+ const qsizetype offset = qsizetype(std::distance(begin, out));
+ qsizetype newSize = 0;
+ if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
+ Q_CHECK_PTR(false);
+ return {nullptr, 0};
+ }
+ mb.resize(newSize);
+ auto it = mb.data();
+ if (wasStackBuffer)
+ it = std::copy_n(buf.data(), offset, it);
+ else
+ it += offset;
+ return {it, size};
+ };
+
+ const auto getNextWindowSize = [&]() {
+ int nextIn = qt_saturate<int>(uclen);
+ // The Windows API has some issues if the current window ends in the
+ // middle of a surrogate pair, so we avoid that:
+ if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
+ --nextIn;
+ return nextIn;
+ };
+
+ int len = 0;
+ while (uclen > 0) {
+ const int nextIn = getNextWindowSize();
+ std::tie(out, outlen) = growOut(1); // We need at least one byte
+ if (!out)
+ return {};
+ const int nextOut = qt_saturate<int>(outlen);
+ len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
+ if (len > 0) {
+ ch += nextIn;
+ uclen -= nextIn;
+ out += len;
+ outlen -= len;
} else {
- // Fail. Probably can't happen in fact (dwFlags is 0).
+ int r = GetLastError();
+ if (r == ERROR_INSUFFICIENT_BUFFER) {
+ int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
+ nullptr, nullptr);
+ if (neededLength <= 0) {
+ // Fail. Observed with UTF8 where the input window was max int and ended in an
+ // incomplete sequence, probably a Windows bug. We try to avoid that from
+ // happening by reducing the window size in that case. But let's keep this
+ // branch just in case of other bugs.
+#ifndef QT_NO_DEBUG
+ r = GetLastError();
+ fprintf(stderr,
+ "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
+#endif // !QT_NO_DEBUG
+ break;
+ }
+ std::tie(out, outlen) = growOut(neededLength);
+ if (!out)
+ return {};
+ // and try again...
+ } else {
+ // Fail. Probably can't happen in fact (dwFlags is 0).
#ifndef QT_NO_DEBUG
- // Can't use qWarning(), as it'll recurse to handle %ls
- fprintf(stderr,
- "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
- r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
+ // Can't use qWarning(), as it'll recurse to handle %ls
+ fprintf(stderr,
+ "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r,
+ reinterpret_cast<const wchar_t *>(
+ QStringView(ch, uclen).left(100).toString().utf16()));
#endif
- break;
+ break;
+ }
}
}
- mb.resize(len);
+ if (mb.isEmpty()) {
+ // We must have only used the stack buffer
+ if (out != buf.data()) // else: we return null-array
+ mb = QByteArrayView(buf.data(), out).toByteArray();
+ } else {
+ mb.truncate(std::distance(mb.data(), out));
+ }
return mb;
}
#endif
@@ -1462,6 +1649,7 @@ void QStringConverter::State::reset() noexcept
}
}
+#ifndef QT_BOOTSTRAPPED
static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
{
return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
@@ -1521,6 +1709,7 @@ static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state
{
return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
}
+#endif // !QT_BOOTSTRAPPED
char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
{
@@ -1562,11 +1751,13 @@ static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *sta
static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
+#ifndef QT_BOOTSTRAPPED
static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
+#endif
static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
@@ -1599,7 +1790,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
operation, encoding UTF-16 encoded data (usually in the form of a QString) to
the requested encoding.
- The supported encodings are:
+ The following encodings are always supported:
\list
\li UTF-8
@@ -1613,6 +1804,10 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
\li The system encoding
\endlist
+ QStringConverter may support more encodings depending on how Qt was
+ compiled. If more codecs are supported, they can be listed using
+ availableCodecs().
+
\l {QStringConverter}s can be used as follows to convert some encoded
string to and from UTF-16.
@@ -1702,12 +1897,14 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
{
{ "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
+#ifndef QT_BOOTSTRAPPED
{ "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
{ "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
{ "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
{ "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
{ "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
{ "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
+#endif
{ "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
{ "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
};
@@ -1715,21 +1912,16 @@ const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringCo
// match names case insensitive and skipping '-' and '_'
static bool nameMatch(const char *a, const char *b)
{
- while (*a && *b) {
- if (*a == '-' || *a == '_') {
+ do {
+ while (*a == '-' || *a == '_')
++a;
- continue;
- }
- if (*b == '-' || *b == '_') {
+ while (*b == '-' || *b == '_')
++b;
- continue;
- }
- if (QtMiscUtils::toAsciiLower(*a) != QtMiscUtils::toAsciiLower(*b))
- return false;
- ++a;
- ++b;
- }
- return !*a && !*b;
+ if (!*a && !*b) // end of both strings
+ return true;
+ } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(*b++));
+
+ return false;
}
@@ -1784,7 +1976,7 @@ struct QStringConverterICU : QStringConverter
const void *context;
ucnv_getToUCallBack(icu_conv, &action, &context);
if (context != state)
- ucnv_setToUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
+ ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
// We did reserve enough space:
@@ -1817,7 +2009,7 @@ struct QStringConverterICU : QStringConverter
const void *context;
ucnv_getFromUCallBack(icu_conv, &action, &context);
if (context != state)
- ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
+ ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
// We did reserve enough space:
@@ -2032,6 +2224,7 @@ const char *QStringConverter::name() const noexcept
Returns the canonical name of the encoding this QStringConverter can encode or decode.
Returns a nullptr if the converter is not valid.
+ The returned name is UTF-8 encoded.
\sa isValid()
*/
@@ -2043,9 +2236,13 @@ const char *QStringConverter::name() const noexcept
\c{std::nullopt} is returned. Such a name may, none the less, be accepted by
the QStringConverter constructor when Qt is built with ICU, if ICU provides a
converter with the given name.
+
+ \a name is expected to be UTF-8 encoded.
*/
std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
{
+ if (!name)
+ return std::nullopt;
for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
if (nameMatch(encodingInterfaces[i].name, name))
return QStringConverter::Encoding(i);
@@ -2055,6 +2252,7 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(cons
return std::nullopt;
}
+#ifndef QT_BOOTSTRAPPED
/*!
Returns the encoding for the content of \a data if it can be determined.
\a expectedFirstCharacter can be passed as an additional hint to help determine
@@ -2163,6 +2361,63 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByt
return Utf8;
}
+static qsizetype availableCodecCount()
+{
+#if !QT_CONFIG(icu)
+ return QStringConverter::Encoding::LastEncoding;
+#else
+ /* icu contains also the names of what Qt provides
+ except for the special Locale one (so add one for it)
+ */
+ return 1 + ucnv_countAvailable();
+#endif
+}
+
+/*!
+ Returns a list of names of supported codecs. The names returned
+ by this function can be passed to QStringEncoder's and
+ QStringDecoder's constructor to create a en- or decoder for
+ the given codec.
+
+ This function may be used to obtain a listing of additional codecs beyond
+ the standard ones. Support for additional codecs requires Qt be compiled
+ with support for the ICU library.
+
+ \note The order of codecs is an internal implementation detail
+ and not guaranteed to be stable.
+ */
+QStringList QStringConverter::availableCodecs()
+{
+ auto availableCodec = [](qsizetype index) -> QString
+ {
+ #if !QT_CONFIG(icu)
+ return QString::fromLatin1(encodingInterfaces[index].name);
+ #else
+ if (index == 0) // "Locale", not provided by icu
+ return QString::fromLatin1(
+ encodingInterfaces[QStringConverter::Encoding::System].name);
+ // this mirrors the setup we do to set a converters name
+ UErrorCode status = U_ZERO_ERROR;
+ auto icuName = ucnv_getAvailableName(int32_t(index - 1));
+ const char *standardName = ucnv_getStandardName(icuName, "MIME", &status);
+ if (U_FAILURE(status) || !standardName) {
+ status = U_ZERO_ERROR;
+ standardName = ucnv_getStandardName(icuName, "IANA", &status);
+ }
+ if (!standardName)
+ standardName = icuName;
+ return QString::fromLatin1(standardName);
+ #endif
+ };
+
+ qsizetype codecCount = availableCodecCount();
+ QStringList result;
+ result.reserve(codecCount);
+ for (qsizetype i = 0; i < codecCount; ++i)
+ result.push_back(availableCodec(i));
+ return result;
+}
+
/*!
Tries to determine the encoding of the HTML in \a data by looking at leading byte
order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
@@ -2186,7 +2441,7 @@ QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
return QStringDecoder(Utf8);
}
-
+#endif // !QT_BOOTSTRAPPED
/*!
Returns the canonical name for encoding \a e.
@@ -2255,12 +2510,24 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
*/
/*!
- \fn QByteArray QStringEncoder::encode(const QString &in)
- \fn QByteArray QStringEncoder::encode(QStringView in)
- \fn QByteArray QStringEncoder::operator()(const QString &in)
- \fn QByteArray QStringEncoder::operator()(QStringView in)
+ \fn constexpr QStringEncoder::QStringEncoder(const QString &name, Flags flags = Flag::Default)
+ \since 6.8
+
+ Creates an encoder object using \a name and \a flags.
+ If \a name is not the name of a known encoding an invalid converter will get created.
- Converts \a in and returns the data as a byte array.
+ \sa isValid()
+*/
+
+/*!
+ \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
+ \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
+ \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
+ \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
+
+ Converts \a in and returns a struct that is implicitly convertible to QByteArray.
+
+ \snippet code/src_corelib_text_qstringconverter.cpp 5
*/
/*!
@@ -2344,12 +2611,25 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
*/
/*!
- \fn QString QStringDecoder::operator()(const QByteArray &ba)
- \fn QString QStringDecoder::decode(const QByteArray &ba)
- \fn QString QStringDecoder::operator()(QByteArrayView ba)
- \fn QString QStringDecoder::decode(QByteArrayView ba)
+ \fn constexpr QStringDecoder::QStringDecoder(const QString &name, Flags flags = Flag::Default)
+ \since 6.8
+
+ Creates an decoder object using \a name and \a flags.
+ If \a name is not the name of a known encoding an invalid converter will get created.
- Converts \a ba and returns the data as a QString.
+ \sa isValid()
+*/
+
+/*!
+ \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
+ \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
+ \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
+ \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
+
+ Converts \a ba and returns a struct that is implicitly convertible to QString.
+
+
+ \snippet code/src_corelib_text_qstringconverter.cpp 4
*/
/*!
@@ -2374,4 +2654,10 @@ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
\sa requiredSpace
*/
+/*!
+ \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
+ \since 6.6
+ \overload
+*/
+
QT_END_NAMESPACE