summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/corelib/text/qstringconverter.cpp 172
-rw-r--r-- tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp 8
-rw-r--r-- tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp 8
3 files changed, 82 insertions, 106 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index f4a51ef93d..43ebde6449 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -416,6 +416,8 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
Q_ASSERT(state);
const QChar *uc = in.data();
qsizetype len = in.length();
+ if (!len)
+ return out;
auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
@@ -433,56 +435,50 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
const ushort *src = reinterpret_cast<const ushort *>(uc);
const ushort *const end = src + len;
- int surrogate_high = -1;
- if (state->remainingChars) {
- surrogate_high = state->state_data[0];
- } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
- // append UTF-8 BOM
- *cursor++ = utf8bom[0];
- *cursor++ = utf8bom[1];
- *cursor++ = utf8bom[2];
- state->internalState |= HeaderDone;
+ if (!(state->flags & QStringDecoder::Flag::Stateless)) {
+ if (state->remainingChars) {
+ int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
+ if (res < 0)
+ cursor = appendReplacementChar(cursor);
+ state->state_data[0] = 0;
+ state->remainingChars = 0;
+ } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
+ // append UTF-8 BOM
+ *cursor++ = utf8bom[0];
+ *cursor++ = utf8bom[1];
+ *cursor++ = utf8bom[2];
+ state->internalState |= HeaderDone;
+ }
}
- const ushort *nextAscii = src;
while (src != end) {
- int res;
- ushort uc;
- if (surrogate_high != -1) {
- uc = surrogate_high;
- surrogate_high = -1;
- res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
- } else {
- if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
- break;
+ const ushort *nextAscii = end;
+ if (simdEncodeAscii(cursor, nextAscii, src, end))
+ break;
- uc = *src++;
- res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
- }
- if (Q_LIKELY(res >= 0))
- continue;
+ do {
+ ushort uc = *src++;
+ int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+ if (Q_LIKELY(res >= 0))
+ continue;
- if (res == QUtf8BaseTraits::Error) {
- // encoding error
- ++state->invalidChars;
- cursor = appendReplacementChar(cursor);
- } else if (res == QUtf8BaseTraits::EndOfString) {
- surrogate_high = uc;
- break;
- }
+ if (res == QUtf8BaseTraits::Error) {
+ // encoding error
+ ++state->invalidChars;
+ cursor = appendReplacementChar(cursor);
+ } else if (res == QUtf8BaseTraits::EndOfString) {
+ if (state->flags & QStringConverter::Flag::Stateless) {
+ ++state->invalidChars;
+ cursor = appendReplacementChar(cursor);
+ } else {
+ state->remainingChars = 1;
+ state->state_data[0] = uc;
+ }
+ return reinterpret_cast<char *>(cursor);
+ }
+ } while (src < nextAscii);
}
- state->internalState |= HeaderDone;
- state->remainingChars = 0;
- if (surrogate_high >= 0) {
- if (state->flags & QStringConverter::Flag::Stateless) {
- ++state->invalidChars;
- cursor = appendReplacementChar(cursor);
- } else {
- state->remainingChars = 1;
- state->state_data[0] = surrogate_high;
- }
- }
return reinterpret_cast<char *>(cursor);
}
@@ -581,8 +577,9 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QStringConverter::State *state)
{
Q_ASSERT(state);
+ if (!len)
+ return out;
- bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
ushort replacement = QChar::ReplacementCharacter;
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
@@ -595,62 +592,60 @@ QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QSt
const uchar *src = reinterpret_cast<const uchar *>(chars);
const uchar *end = src + len;
- if (state->remainingChars) {
- // handle incoming state first
- uchar remainingCharsData[4]; // longest UTF-8 sequence possible
- qsizetype remainingCharsCount = state->remainingChars;
- qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
-
- memset(remainingCharsData, 0, sizeof(remainingCharsData));
- memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
- memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
-
- const uchar *begin = &remainingCharsData[1];
- res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
- static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
- if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
- // special case for len == 0:
- // if we were supplied an empty string, terminate the previous, unfinished sequence with error
- ++state->invalidChars;
- *dst++ = replacement;
- } else if (res == QUtf8BaseTraits::EndOfString) {
- // if we got EndOfString again, then there were too few bytes in src;
- // copy to our state and return
- state->remainingChars = remainingCharsCount + newCharsToCopy;
- memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
- return out;
- } else if (!headerdone && res >= 0) {
- // eat the UTF-8 BOM
- headerdone = true;
- if (dst[-1] == 0xfeff)
- --dst;
- }
+ if (!(state->flags & QStringConverter::Flag::Stateless)) {
+ bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
+ if (state->remainingChars || !headerdone) {
+ // handle incoming state first
+ uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+ qsizetype remainingCharsCount = state->remainingChars;
+ qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
+
+ memset(remainingCharsData, 0, sizeof(remainingCharsData));
+ memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+ memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+ const uchar *begin = &remainingCharsData[1];
+ res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+ static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+ if (res == QUtf8BaseTraits::Error) {
+ ++state->invalidChars;
+ *dst++ = replacement;
+ ++src;
+ } else if (res == QUtf8BaseTraits::EndOfString) {
+ // if we got EndOfString again, then there were too few bytes in src;
+ // copy to our state and return
+ state->remainingChars = remainingCharsCount + newCharsToCopy;
+ memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+ return out;
+ } else if (!headerdone) {
+ // eat the UTF-8 BOM
+ if (dst[-1] == 0xfeff)
+ --dst;
+ }
+ state->internalState |= HeaderDone;
- // adjust src now that we have maybe consumed a few chars
- if (res >= 0) {
- Q_ASSERT(res > remainingCharsCount);
- src += res - remainingCharsCount;
+ // adjust src now that we have maybe consumed a few chars
+ if (res >= 0) {
+ Q_ASSERT(res > remainingCharsCount);
+ src += res - remainingCharsCount;
+ }
}
+ } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
+ // stateless, remove initial BOM
+ if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
+ // skip BOM
+ src += 3;
}
// main body, stateless decoding
res = 0;
const uchar *nextAscii = src;
- const uchar *start = src;
while (res >= 0 && src < end) {
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
break;
ch = *src++;
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
- if (!headerdone && res >= 0) {
- headerdone = true;
- if (src == start + 3) { // 3 == sizeof(utf8-bom)
- // eat the UTF-8 BOM (it can only appear at the beginning of the string).
- if (dst[-1] == 0xfeff)
- --dst;
- }
- }
if (res == QUtf8BaseTraits::Error) {
res = 0;
++state->invalidChars;
@@ -677,9 +672,6 @@ QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QSt
state->remainingChars = 0;
}
- if (headerdone)
- state->internalState |= HeaderDone;
-
return reinterpret_cast<QChar *>(dst);
}
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index 4dc75b11c0..08c2954484 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -1641,14 +1641,6 @@ void tst_QTextCodec::utf8stateful_data()
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
- // invalid: sequence too short (the empty second buffer causes a state reset)
- QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
- QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
- QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
- QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
- QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
- QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
-
// overlong sequence:
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
index e95bce6915..58ae28bbf4 100644
--- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
+++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
@@ -1335,14 +1335,6 @@ void tst_QStringConverter::utf8stateful_data()
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
- // invalid: sequence too short (the empty second buffer causes a state reset)
- QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
- QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
- QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
- QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
- QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
- QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
-
// overlong sequence:
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();