From bab2cd1125a21885bea97079219031ab45861826 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Fri, 6 Nov 2020 23:48:49 -0800 Subject: QCborStreamReader: move the UTF-8 decoding into readStringChunk This allows us to decode long UTF-8 strings in chunks, instead of allocating a big block of the size of the UTF-8 source and then another for the full UTF-16 content. Pick-to: 5.15 6.0 Task-number: QTBUG-88253 Change-Id: I7b9b97ae9b32412abdc6fffd16452a47b1036ef3 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/serialization/qcborstreamreader.cpp | 146 ++++++++++++++++++------ 1 file changed, 109 insertions(+), 37 deletions(-) diff --git a/src/corelib/serialization/qcborstreamreader.cpp b/src/corelib/serialization/qcborstreamreader.cpp index d097979bda..ef2ec08b88 100644 --- a/src/corelib/serialization/qcborstreamreader.cpp +++ b/src/corelib/serialization/qcborstreamreader.cpp @@ -48,6 +48,7 @@ #include #include #include +#include QT_BEGIN_NAMESPACE @@ -671,18 +672,23 @@ public: union { char *ptr; QByteArray *array; + QString *string; }; - enum { ByteArray = -1 }; + enum { ByteArray = -1, String = -3 }; qsizetype maxlen_or_type; ReadStringChunk(char *ptr, qsizetype maxlen) : ptr(ptr), maxlen_or_type(maxlen) {} ReadStringChunk(QByteArray *array) : array(array), maxlen_or_type(ByteArray) {} + ReadStringChunk(QString *str) : string(str), maxlen_or_type(String) {} + bool isString() const { return maxlen_or_type == String; } bool isByteArray() const { return maxlen_or_type == ByteArray; } bool isPlainPointer() const { return maxlen_or_type >= 0; } }; static QCborStreamReader::StringResultCode appendStringChunk(QCborStreamReader &reader, QByteArray *data); QCborStreamReader::StringResult readStringChunk(ReadStringChunk params); + qsizetype readStringChunk_byte(ReadStringChunk params, qsizetype len); + qsizetype readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len); bool ensureStringIteration(); }; @@ -1354,29 +1360,17 @@ bool QCborStreamReader::leaveContainer() */ QCborStreamReader::StringResult QCborStreamReader::_readString_helper() { - auto r = _readByteArray_helper(); QCborStreamReader::StringResult result; + auto r = d->readStringChunk(&result.data); result.status = r.status; - - if (r.status == Ok) { - // See QUtf8::convertToUnicode() a detailed explanation of why this - // conversion uses the same number of words or less. - CborError err = CborNoError; - if (r.data.size() > MaxStringSize) { - err = CborErrorDataTooLarge; - } else { - QStringConverter::State cs(QStringConverter::Flag::Stateless); - result.data = QUtf8::convertToUnicode(r.data, &cs); - if (cs.invalidChars != 0 || cs.remainingChars != 0) - err = CborErrorInvalidUtf8TextString; - } - - if (err) { - d->handleError(err); - result.data.clear(); - result.status = Error; - } + if (r.status == Error) { + result.data.clear(); + } else { + Q_ASSERT(r.data == result.data.length()); + if (r.status == EndOfString && lastError() == QCborError::NoError) + preparse(); } + return result; } @@ -1547,12 +1541,41 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params) return result; } - // Read the chunk into the user's buffer. - qint64 actuallyRead; qptrdiff offset = qptrdiff(content); + bufferStart += offset; + if (device) { + // This first skip can't fail because we've already read this many bytes. + device->skip(bufferStart); + } + + if (params.isString()) { + // readString() + result.data = readStringChunk_unicode(params, qsizetype(len)); + } else { + // readByteArray() or readStringChunk() + result.data = readStringChunk_byte(params, qsizetype(len)); + } + + if (result.data < 0) + return result; // error + + if (device) + updateBufferAfterString(0, len); + else + bufferStart += len; + + preread(); + result.status = QCborStreamReader::Ok; + return result; +} + +inline qsizetype +QCborStreamReaderPrivate::readStringChunk_byte(ReadStringChunk params, qsizetype len) +{ + qint64 actuallyRead; qsizetype toRead = qsizetype(len); qsizetype left = 0; // bytes from the chunk not copied to the user buffer, to discard - char *ptr; + char *ptr = nullptr; if (params.isPlainPointer()) { left = toRead - params.maxlen_or_type; @@ -1567,7 +1590,7 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params) auto newSize = oldSize; if (add_overflow(oldSize, toRead, &newSize)) { handleError(CborErrorDataTooLarge); - return result; + return -1; } try { params.array->resize(newSize); @@ -1576,15 +1599,13 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params) // compatibility with Qt 5; in Qt 6, we could consider everything // to be OOM. handleError(newSize > MaxByteArraySize ? CborErrorDataTooLarge: CborErrorOutOfMemory); - return result; + return -1; } ptr = const_cast(params.array->constData()) + oldSize; } if (device) { - // This first skip can't fail because we've already read this many bytes. - device->skip(bufferStart + qptrdiff(content)); actuallyRead = device->read(ptr, toRead); if (actuallyRead != toRead) { @@ -1597,20 +1618,71 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params) if (actuallyRead < 0) { handleError(CborErrorIO); - return result; + return -1; } - - updateBufferAfterString(offset, len); } else { actuallyRead = toRead; - memcpy(ptr, buffer.constData() + bufferStart + offset, toRead); - bufferStart += QByteArray::size_type(offset + len); + memcpy(ptr, buffer.constData() + bufferStart, toRead); } - preread(); - result.data = actuallyRead; - result.status = QCborStreamReader::Ok; - return result; + return actuallyRead; +} + +inline qsizetype +QCborStreamReaderPrivate::readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len) +{ + // See QUtf8::convertToUnicode() a detailed explanation of why this + // conversion uses the same number of words or less. + QChar *begin = nullptr; + if (params.isString()) { + try { + params.string->resize(utf8len); + } catch (const std::bad_alloc &) { + if (utf8len > MaxStringSize) + handleError(CborErrorDataTooLarge); + else + handleError(CborErrorOutOfMemory); + return -1; + } + + begin = const_cast(params.string->constData()); + } + + QChar *ptr = begin; + QStringConverter::State cs(QStringConverter::Flag::Stateless); + if (device == nullptr) { + // Easy case: we can decode straight from the buffer we already have + ptr = QUtf8::convertToUnicode(ptr, { buffer.constData() + bufferStart, utf8len }, &cs); + } else { + // read in chunks, to avoid creating large, intermediate buffers + constexpr qsizetype StringChunkSize = 16384; + qsizetype chunkSize = qMin(StringChunkSize, utf8len); + QVarLengthArray chunk(chunkSize); + + cs = { QStringConverter::Flag::ConvertInitialBom }; + while (utf8len > 0 && cs.invalidChars == 0) { + qsizetype toRead = qMin(chunkSize, utf8len); + qint64 actuallyRead = device->read(chunk.data(), toRead); + if (actuallyRead == toRead) + ptr = QUtf8::convertToUnicode(ptr, { chunk.data(), toRead }, &cs); + + if (actuallyRead != toRead) { + handleError(CborErrorIO); + return -1; + } + utf8len -= toRead; + } + } + + if (cs.invalidChars != 0 || cs.remainingChars != 0) { + handleError(CborErrorInvalidUtf8TextString); + return -1; + } + + qsizetype size = ptr - begin; + if (params.isString()) + params.string->truncate(size); + return size; } QT_END_NAMESPACE -- cgit v1.2.3