From be2635b8dd37dfa7cb3f1c41544b2736d13a058d Mon Sep 17 00:00:00 2001 From: Laszlo Agocs Date: Tue, 22 Sep 2020 19:54:30 +0200 Subject: rhi: Reuse the data in buffer ops in res.update batches Because having profilers bombarded with mallocs (due to creating deep copy QByteArrays) is not nice. Change-Id: I848f41f3465d6dc2a58a193cc863495aacf13d79 Reviewed-by: Andy Nichols --- src/gui/rhi/qrhi_p_p.h | 43 +++++++++++++++++++++++++++++++++++++++---- src/gui/rhi/qrhid3d11.cpp | 6 +++--- src/gui/rhi/qrhigles2.cpp | 10 +++++----- src/gui/rhi/qrhimetal.mm | 10 +++++----- src/gui/rhi/qrhinull.cpp | 2 +- src/gui/rhi/qrhivulkan.cpp | 14 +++++++------- 6 files changed, 60 insertions(+), 25 deletions(-) diff --git a/src/gui/rhi/qrhi_p_p.h b/src/gui/rhi/qrhi_p_p.h index a7d8a40bf9..13f1c88f2d 100644 --- a/src/gui/rhi/qrhi_p_p.h +++ b/src/gui/rhi/qrhi_p_p.h @@ -289,6 +289,7 @@ public: QRhiBuffer *buf; int offset; QByteArray data; + int dataSize; // the real number of currently used bytes in data, not the same as data.size() int readSize; QRhiBufferReadbackResult *result; @@ -298,7 +299,9 @@ public: op.type = DynamicUpdate; op.buf = buf; op.offset = offset; - op.data = QByteArray(reinterpret_cast(data), size ? size : buf->size()); + const int effectiveSize = size ? size : buf->size(); + op.data = QByteArray(reinterpret_cast(data), effectiveSize); + op.dataSize = effectiveSize; return op; } @@ -307,7 +310,29 @@ public: op->type = DynamicUpdate; op->buf = buf; op->offset = offset; - op->data = QByteArray(reinterpret_cast(data), size ? size : buf->size()); + const int effectiveSize = size ? size : buf->size(); + + // Why the isDetached check? Simply because the cost of detaching + // with a larger allocation may be a lot higher than creating a new + // deep copy bytearray with our (potentially lot smaller) data. + // This reduces the benefits with certain backends (e.g. Vulkan) + // that hold on to the data (implicit sharing!) of host visible + // buffers for the current and next frame (assuming 2 frames in + // flight), but it is still an improvement (enabled by + // nextResourceUpdateBatch's shuffling when choosing a free batch + // from the pool). For other backends (e.g. D3D11) this can reduce + // mallocs (caused by creating new deep copy bytearrays) almost + // completely after a few frames (assuming of course that no + // dynamic elements with larger buffer data appear). + + if (op->data.isDetached()) { + if (op->data.size() < effectiveSize) + op->data.resize(effectiveSize); + memcpy(op->data.data(), data, effectiveSize); + } else { + op->data = QByteArray(reinterpret_cast(data), effectiveSize); + } + op->dataSize = effectiveSize; } static BufferOp staticUpload(QRhiBuffer *buf, int offset, int size, const void *data) @@ -316,7 +341,9 @@ public: op.type = StaticUpload; op.buf = buf; op.offset = offset; - op.data = QByteArray(reinterpret_cast(data), size ? size : buf->size()); + const int effectiveSize = size ? size : buf->size(); + op.data = QByteArray(reinterpret_cast(data), effectiveSize); + op.dataSize = effectiveSize; return op; } @@ -325,7 +352,15 @@ public: op->type = StaticUpload; op->buf = buf; op->offset = offset; - op->data = QByteArray(reinterpret_cast(data), size ? size : buf->size()); + const int effectiveSize = size ? size : buf->size(); + if (op->data.isDetached()) { + if (op->data.size() < effectiveSize) + op->data.resize(effectiveSize); + memcpy(op->data.data(), data, effectiveSize); + } else { + op->data = QByteArray(reinterpret_cast(data), effectiveSize); + } + op->dataSize = effectiveSize; } static BufferOp read(QRhiBuffer *buf, int offset, int size, QRhiBufferReadbackResult *result) diff --git a/src/gui/rhi/qrhid3d11.cpp b/src/gui/rhi/qrhid3d11.cpp index 3b14b77ce1..e924966585 100644 --- a/src/gui/rhi/qrhid3d11.cpp +++ b/src/gui/rhi/qrhid3d11.cpp @@ -1418,12 +1418,12 @@ void QRhiD3D11::enqueueResourceUpdates(QRhiCommandBuffer *cb, QRhiResourceUpdate if (u.type == QRhiResourceUpdateBatchPrivate::BufferOp::DynamicUpdate) { QD3D11Buffer *bufD = QRHI_RES(QD3D11Buffer, u.buf); Q_ASSERT(bufD->m_type == QRhiBuffer::Dynamic); - memcpy(bufD->dynBuf.data() + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(bufD->dynBuf.data() + u.offset, u.data.constData(), size_t(u.dataSize)); bufD->hasPendingDynamicUpdates = true; } else if (u.type == QRhiResourceUpdateBatchPrivate::BufferOp::StaticUpload) { QD3D11Buffer *bufD = QRHI_RES(QD3D11Buffer, u.buf); Q_ASSERT(bufD->m_type != QRhiBuffer::Dynamic); - Q_ASSERT(u.offset + u.data.size() <= bufD->m_size); + Q_ASSERT(u.offset + u.dataSize <= bufD->m_size); QD3D11CommandBuffer::Command cmd; cmd.cmd = QD3D11CommandBuffer::Command::UpdateSubRes; cmd.args.updateSubRes.dst = bufD->buffer; @@ -1437,7 +1437,7 @@ void QRhiD3D11::enqueueResourceUpdates(QRhiCommandBuffer *cb, QRhiResourceUpdate box.left = UINT(u.offset); box.top = box.front = 0; box.back = box.bottom = 1; - box.right = UINT(u.offset + u.data.size()); // no -1: right, bottom, back are exclusive, see D3D11_BOX doc + box.right = UINT(u.offset + u.dataSize); // no -1: right, bottom, back are exclusive, see D3D11_BOX doc cmd.args.updateSubRes.hasDstBox = true; cmd.args.updateSubRes.dstBox = box; cbD->commands.append(cmd); diff --git a/src/gui/rhi/qrhigles2.cpp b/src/gui/rhi/qrhigles2.cpp index 190d506f9a..a65fcdb919 100644 --- a/src/gui/rhi/qrhigles2.cpp +++ b/src/gui/rhi/qrhigles2.cpp @@ -1687,7 +1687,7 @@ void QRhiGles2::enqueueResourceUpdates(QRhiCommandBuffer *cb, QRhiResourceUpdate QGles2Buffer *bufD = QRHI_RES(QGles2Buffer, u.buf); Q_ASSERT(bufD->m_type == QRhiBuffer::Dynamic); if (bufD->m_usage.testFlag(QRhiBuffer::UniformBuffer)) { - memcpy(bufD->ubuf.data() + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(bufD->ubuf.data() + u.offset, u.data.constData(), size_t(u.dataSize)); } else { trackedBufferBarrier(cbD, bufD, QGles2Buffer::AccessUpdate); QGles2CommandBuffer::Command cmd; @@ -1695,16 +1695,16 @@ void QRhiGles2::enqueueResourceUpdates(QRhiCommandBuffer *cb, QRhiResourceUpdate cmd.args.bufferSubData.target = bufD->targetForDataOps; cmd.args.bufferSubData.buffer = bufD->buffer; cmd.args.bufferSubData.offset = u.offset; - cmd.args.bufferSubData.size = u.data.size(); + cmd.args.bufferSubData.size = u.dataSize; cmd.args.bufferSubData.data = cbD->retainData(u.data); cbD->commands.append(cmd); } } else if (u.type == QRhiResourceUpdateBatchPrivate::BufferOp::StaticUpload) { QGles2Buffer *bufD = QRHI_RES(QGles2Buffer, u.buf); Q_ASSERT(bufD->m_type != QRhiBuffer::Dynamic); - Q_ASSERT(u.offset + u.data.size() <= bufD->m_size); + Q_ASSERT(u.offset + u.dataSize <= bufD->m_size); if (bufD->m_usage.testFlag(QRhiBuffer::UniformBuffer)) { - memcpy(bufD->ubuf.data() + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(bufD->ubuf.data() + u.offset, u.data.constData(), size_t(u.dataSize)); } else { trackedBufferBarrier(cbD, bufD, QGles2Buffer::AccessUpdate); QGles2CommandBuffer::Command cmd; @@ -1712,7 +1712,7 @@ void QRhiGles2::enqueueResourceUpdates(QRhiCommandBuffer *cb, QRhiResourceUpdate cmd.args.bufferSubData.target = bufD->targetForDataOps; cmd.args.bufferSubData.buffer = bufD->buffer; cmd.args.bufferSubData.offset = u.offset; - cmd.args.bufferSubData.size = u.data.size(); + cmd.args.bufferSubData.size = u.dataSize; cmd.args.bufferSubData.data = cbD->retainData(u.data); cbD->commands.append(cmd); } diff --git a/src/gui/rhi/qrhimetal.mm b/src/gui/rhi/qrhimetal.mm index 55adaa4d54..7bcb6864d6 100644 --- a/src/gui/rhi/qrhimetal.mm +++ b/src/gui/rhi/qrhimetal.mm @@ -1703,10 +1703,10 @@ void QRhiMetal::enqueueResourceUpdates(QRhiCommandBuffer *cb, QRhiResourceUpdate // basically the same. So go through the same pendingUpdates machinery. QMetalBuffer *bufD = QRHI_RES(QMetalBuffer, u.buf); Q_ASSERT(bufD->m_type != QRhiBuffer::Dynamic); - Q_ASSERT(u.offset + u.data.size() <= bufD->m_size); + Q_ASSERT(u.offset + u.dataSize <= bufD->m_size); for (int i = 0, ie = bufD->d->slotted ? QMTL_FRAMES_IN_FLIGHT : 1; i != ie; ++i) bufD->d->pendingUpdates[i].append( - QRhiResourceUpdateBatchPrivate::BufferOp::dynamicUpdate(u.buf, u.offset, u.data.size(), u.data.constData())); + QRhiResourceUpdateBatchPrivate::BufferOp::dynamicUpdate(u.buf, u.offset, u.dataSize, u.data.constData())); } else if (u.type == QRhiResourceUpdateBatchPrivate::BufferOp::Read) { QMetalBuffer *bufD = QRHI_RES(QMetalBuffer, u.buf); executeBufferHostWritesForCurrentFrame(bufD); @@ -1868,11 +1868,11 @@ void QRhiMetal::executeBufferHostWritesForSlot(QMetalBuffer *bufD, int slot) int changeEnd = -1; for (const QRhiResourceUpdateBatchPrivate::BufferOp &u : qAsConst(bufD->d->pendingUpdates[slot])) { Q_ASSERT(bufD == QRHI_RES(QMetalBuffer, u.buf)); - memcpy(static_cast(p) + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(static_cast(p) + u.offset, u.data.constData(), size_t(u.dataSize)); if (changeBegin == -1 || u.offset < changeBegin) changeBegin = u.offset; - if (changeEnd == -1 || u.offset + u.data.size() > changeEnd) - changeEnd = u.offset + u.data.size(); + if (changeEnd == -1 || u.offset + u.dataSize > changeEnd) + changeEnd = u.offset + u.dataSize; } #ifdef Q_OS_MACOS if (changeBegin >= 0 && bufD->d->managed) diff --git a/src/gui/rhi/qrhinull.cpp b/src/gui/rhi/qrhinull.cpp index 94d24c33a9..087559728d 100644 --- a/src/gui/rhi/qrhinull.cpp +++ b/src/gui/rhi/qrhinull.cpp @@ -465,7 +465,7 @@ void QRhiNull::resourceUpdate(QRhiCommandBuffer *cb, QRhiResourceUpdateBatch *re || u.type == QRhiResourceUpdateBatchPrivate::BufferOp::StaticUpload) { QNullBuffer *bufD = QRHI_RES(QNullBuffer, u.buf); - memcpy(bufD->data.data() + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(bufD->data.data() + u.offset, u.data.constData(), size_t(u.dataSize)); } else if (u.type == QRhiResourceUpdateBatchPrivate::BufferOp::Read) { QRhiBufferReadbackResult *result = u.result; result->data.resize(u.readSize); diff --git a/src/gui/rhi/qrhivulkan.cpp b/src/gui/rhi/qrhivulkan.cpp index 9ac8ef70ee..835db74804 100644 --- a/src/gui/rhi/qrhivulkan.cpp +++ b/src/gui/rhi/qrhivulkan.cpp @@ -2933,7 +2933,7 @@ void QRhiVulkan::enqueueResourceUpdates(QVkCommandBuffer *cbD, QRhiResourceUpdat } else if (u.type == QRhiResourceUpdateBatchPrivate::BufferOp::StaticUpload) { QVkBuffer *bufD = QRHI_RES(QVkBuffer, u.buf); Q_ASSERT(bufD->m_type != QRhiBuffer::Dynamic); - Q_ASSERT(u.offset + u.data.size() <= bufD->m_size); + Q_ASSERT(u.offset + u.dataSize <= bufD->m_size); if (!bufD->stagingBuffers[currentFrameSlot]) { VkBufferCreateInfo bufferInfo; @@ -2967,9 +2967,9 @@ void QRhiVulkan::enqueueResourceUpdates(QVkCommandBuffer *cbD, QRhiResourceUpdat qWarning("Failed to map buffer: %d", err); continue; } - memcpy(static_cast(p) + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(static_cast(p) + u.offset, u.data.constData(), size_t(u.dataSize)); vmaUnmapMemory(toVmaAllocator(allocator), a); - vmaFlushAllocation(toVmaAllocator(allocator), a, VkDeviceSize(u.offset), VkDeviceSize(u.data.size())); + vmaFlushAllocation(toVmaAllocator(allocator), a, VkDeviceSize(u.offset), VkDeviceSize(u.dataSize)); trackedBufferBarrier(cbD, bufD, 0, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); @@ -2978,7 +2978,7 @@ void QRhiVulkan::enqueueResourceUpdates(QVkCommandBuffer *cbD, QRhiResourceUpdat memset(©Info, 0, sizeof(copyInfo)); copyInfo.srcOffset = VkDeviceSize(u.offset); copyInfo.dstOffset = VkDeviceSize(u.offset); - copyInfo.size = VkDeviceSize(u.data.size()); + copyInfo.size = VkDeviceSize(u.dataSize); QVkCommandBuffer::Command cmd; cmd.cmd = QVkCommandBuffer::Command::CopyBuffer; @@ -3428,11 +3428,11 @@ void QRhiVulkan::executeBufferHostWritesForSlot(QVkBuffer *bufD, int slot) int changeEnd = -1; for (const QRhiResourceUpdateBatchPrivate::BufferOp &u : qAsConst(bufD->pendingDynamicUpdates[slot])) { Q_ASSERT(bufD == QRHI_RES(QVkBuffer, u.buf)); - memcpy(static_cast(p) + u.offset, u.data.constData(), size_t(u.data.size())); + memcpy(static_cast(p) + u.offset, u.data.constData(), size_t(u.dataSize)); if (changeBegin == -1 || u.offset < changeBegin) changeBegin = u.offset; - if (changeEnd == -1 || u.offset + u.data.size() > changeEnd) - changeEnd = u.offset + u.data.size(); + if (changeEnd == -1 || u.offset + u.dataSize > changeEnd) + changeEnd = u.offset + u.dataSize; } vmaUnmapMemory(toVmaAllocator(allocator), a); if (changeBegin >= 0) -- cgit v1.2.3