Add qbswap for a memory region

The compiler was generating some vectorized code for qresource.cpp but it wasn't very efficient. So improve upon it and make use in other places where we read UTF-16BE strings. [ChangeLog][QtCore] Added an overload of q{To,From}{Big,Little}Endian that operates on a memory region. Change-Id: I6a540578e810472bb455fffd1531fa2f1d724dfc Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2018-05-25 16:33:48 -0300
committer: Thiago Macieira <thiago.macieira@intel.com> 2018-07-04 03:04:40 +0000
commit: d0427759c67704fe0f1b04edadd4d30329af268c (patch)
tree: be555a72712a68658dd907fc600f03385df8e2cf /src/corelib/global/qendian.cpp
parent: c053f9d15467459477ccbbe156a096e3ac5e3a91 (diff)
1 files changed, 288 insertions, 9 deletions
diff --git a/src/corelib/global/qendian.cpp b/src/corelib/global/qendian.cpp
index 65df25a205..ec6bfec60e 100644
--- a/src/corelib/global/qendian.cpp
+++ b/src/corelib/global/qendian.cpp
@@ -1,11 +1,12 @@
 /****************************************************************************
 **
 ** Copyright (C) 2016 The Qt Company Ltd.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
-** This file is part of the documentation of the Qt Toolkit.
+** This file is part of the QtCore module of the Qt Toolkit.
 **
-** $QT_BEGIN_LICENSE:FDL$
+** $QT_BEGIN_LICENSE:LGPL$
 ** Commercial License Usage
 ** Licensees holding valid commercial Qt licenses may use this file in
 ** accordance with the commercial license agreement provided with the
@@ -14,17 +15,36 @@
 ** and conditions see https://www.qt.io/terms-conditions. For further
 ** information use the contact form at https://www.qt.io/contact-us.
 **
-** GNU Free Documentation License Usage
-** Alternatively, this file may be used under the terms of the GNU Free
-** Documentation License version 1.3 as published by the Free Software
-** Foundation and appearing in the file included in the packaging of
-** this file. Please review the following information to ensure
-** the GNU Free Documentation License version 1.3 requirements
-** will be met: https://www.gnu.org/licenses/fdl-1.3.html.
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
 ** $QT_END_LICENSE$
 **
 ****************************************************************************/
 
+#include "qendian.h"
+
+#include "qalgorithms.h"
+#include <private/qsimd_p.h>
+
+QT_BEGIN_NAMESPACE
+
 /*!
     \headerfile <QtEndian>
     \title Endian Conversion Functions
@@ -90,6 +110,32 @@
     unmodified.
 */
 /*!
+    \fn template <typename T> T qFromBigEndian(const void *src, qsizetype count, void *dest)
+    \since 5.12
+    \relates <QtEndian>
+
+    Reads \a count big-endian numbers from memory location \a src and stores
+    them in the host byte order representation at \a dest. On CPU architectures
+    where the host byte order is little-endian (such as x86) this will swap the
+    byte order; otherwise it will just perform a \c memcpy from \a src to \a
+    dest.
+
+    \note Template type \c{T} can either be a quint16, qint16, quint32, qint32,
+    quint64, or qint64. Other types of integers, e.g., qlong, are not
+    applicable.
+
+    There are no data alignment constraints for \a src. However, \a dest is
+    expected to be naturally aligned for type \c{T}.
+
+    If \a src and \a dest can be the same pointer, this function will perform
+    an in-place swap (if necessary). If they are not the same, the memory
+    regions must not overlap.
+
+    \sa qFromLittleEndian()
+    \sa qToBigEndian()
+    \sa qToLittleEndian()
+*/
+/*!
     \fn template <typename T> T qFromLittleEndian(const void *src)
     \since 4.3
     \relates <QtEndian>
@@ -123,6 +169,32 @@
     unmodified.
 */
 /*!
+    \fn template <typename T> T qFromLittleEndian(const void *src, qsizetype count, void *dest)
+    \since 5.12
+    \relates <QtEndian>
+
+    Reads \a count little-endian numbers from memory location \a src and stores
+    them in the host byte order representation at \a dest. On CPU architectures
+    where the host byte order is big-endian (such as PowerPC) this will swap the
+    byte order; otherwise it will just perform a \c memcpy from \a src to \a
+    dest.
+
+    \note Template type \c{T} can either be a quint16, qint16, quint32, qint32,
+    quint64, or qint64. Other types of integers, e.g., qlong, are not
+    applicable.
+
+    There are no data alignment constraints for \a src. However, \a dest is
+    expected to be naturally aligned for type \c{T}.
+
+    If \a src and \a dest can be the same pointer, this function will perform
+    an in-place swap (if necessary). If they are not the same, the memory
+    regions must not overlap.
+
+    \sa qFromLittleEndian()
+    \sa qToBigEndian()
+    \sa qToLittleEndian()
+*/
+/*!
     \fn template <typename T> void qToBigEndian(T src, void *dest)
     \since 4.3
     \relates <QtEndian>
@@ -153,6 +225,32 @@
     unmodified.
 */
 /*!
+    \fn template <typename T> T qToBigEndian(const void *src, qsizetype count, void *dest)
+    \since 5.12
+    \relates <QtEndian>
+
+    Reads \a count numbers from memory location \a src in the host byte order
+    and stores them in big-endian representation at \a dest. On CPU
+    architectures where the host byte order is little-endian (such as x86) this
+    will swap the byte order; otherwise it will just perform a \c memcpy from
+    \a src to \a dest.
+
+    \note Template type \c{T} can either be a quint16, qint16, quint32, qint32,
+    quint64, or qint64. Other types of integers, e.g., qlong, are not
+    applicable.
+
+    There are no data alignment constraints for \a dest. However, \a src is
+    expected to be naturally aligned for type \c{T}.
+
+    If \a src and \a dest can be the same pointer, this function will perform
+    an in-place swap (if necessary). If they are not the same, the memory
+    regions must not overlap.
+
+    \sa qFromLittleEndian()
+    \sa qToBigEndian()
+    \sa qToLittleEndian()
+*/
+/*!
     \fn template <typename T> void qToLittleEndian(T src, void *dest)
     \since 4.3
     \relates <QtEndian>
@@ -182,6 +280,32 @@
     will return \a src with the byte order swapped; otherwise it will return \a src
     unmodified.
 */
+/*!
+    \fn template <typename T> T qToLittleEndian(const void *src, qsizetype count, void *dest)
+    \since 5.12
+    \relates <QtEndian>
+
+    Reads \a count numbers from memory location \a src in the host byte order
+    and stores them in little-endian representation at \a dest. On CPU
+    architectures where the host byte order is big-endian (such as PowerPC)
+    this will swap the byte order; otherwise it will just perform a \c memcpy
+    from \a src to \a dest.
+
+    \note Template type \c{T} can either be a quint16, qint16, quint32, qint32,
+    quint64, or qint64. Other types of integers, e.g., qlong, are not
+    applicable.
+
+    There are no data alignment constraints for \a dest. However, \a src is
+    expected to be naturally aligned for type \c{T}.
+
+    If \a src and \a dest can be the same pointer, this function will perform
+    an in-place swap (if necessary). If they are not the same, the memory
+    regions must not overlap.
+
+    \sa qFromLittleEndian()
+    \sa qToBigEndian()
+    \sa qToLittleEndian()
+*/
 
 /*!
     \class QLEInteger
@@ -552,3 +676,158 @@
 
     \sa qint64
 */
+
+#if defined(__SSSE3__)
+using ShuffleMask = uchar[16];
+Q_DECL_ALIGN(16) static const ShuffleMask shuffleMasks[3] = {
+    // 16-bit
+    {1, 0, 3, 2,  5, 4, 7, 6,  9, 8, 11, 10,  13, 12, 15, 14},
+    // 32-bit
+    {3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,  15, 14, 13, 12},
+    // 64-bit
+    {7, 6, 5, 4, 3, 2, 1, 0,   15, 14, 13, 12, 11, 10, 9, 8}
+};
+
+static size_t sseSwapLoop(const uchar *src, size_t bytes, uchar *dst,
+                          const __m128i *shuffleMaskPtr) noexcept
+{
+    size_t i = 0;
+    const __m128i shuffleMask = _mm_load_si128(shuffleMaskPtr);
+
+#  ifdef __AVX2__
+    const __m256i shuffleMask256 = _mm256_inserti128_si256(_mm256_castsi128_si256(shuffleMask), shuffleMask, 1);
+    for ( ; i + sizeof(__m256i) <= bytes; i += sizeof(__m256i)) {
+        __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
+        data = _mm256_shuffle_epi8(data, shuffleMask256);
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), data);
+    }
+#  else
+    for ( ; i + 2 * sizeof(__m128i) <= bytes; i += 2 * sizeof(__m128i)) {
+        __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
+        __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i) + 1);
+        data1 = _mm_shuffle_epi8(data1, shuffleMask);
+        data2 = _mm_shuffle_epi8(data2, shuffleMask);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i), data1);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i) + 1, data2);
+    }
+#  endif
+
+    if (i + sizeof(__m128i) <= bytes) {
+        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
+        data = _mm_shuffle_epi8(data, shuffleMask);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i), data);
+        i += sizeof(__m128i);
+    }
+
+    return i;
+}
+
+template <typename T> static Q_ALWAYS_INLINE
+size_t simdSwapLoop(const uchar *src, size_t bytes, uchar *dst) noexcept
+{
+    auto shuffleMaskPtr = reinterpret_cast<const __m128i *>(shuffleMasks[0]);
+    shuffleMaskPtr += qCountTrailingZeroBits(sizeof(T)) - 1;
+    size_t i = sseSwapLoop(src, bytes, dst, shuffleMaskPtr);
+
+    // epilogue
+    for (size_t _i = 0 ; i < bytes && _i < sizeof(__m128i); i += sizeof(T), _i += sizeof(T))
+        qbswap(qFromUnaligned<T>(src + i), dst + i);
+
+    // return the total, so the bswapLoop below does nothing
+    return bytes;
+}
+#elif defined(__SSE2__)
+template <typename T> static
+size_t simdSwapLoop(const uchar *, size_t, uchar *) noexcept
+{
+    // no generic version: we can't do 32- and 64-bit swaps easily,
+    // so we won't try
+    return 0;
+}
+
+template <> size_t simdSwapLoop<quint16>(const uchar *src, size_t bytes, uchar *dst) noexcept
+{
+    auto swapEndian = [](__m128i &data) {
+        __m128i lows = _mm_srli_epi16(data, 8);
+        __m128i highs = _mm_slli_epi16(data, 8);
+        data = _mm_xor_si128(lows, highs);
+    };
+
+    size_t i = 0;
+    for ( ; i + 2 * sizeof(__m128i) <= bytes; i += 2 * sizeof(__m128i)) {
+        __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
+        __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i) + 1);
+        swapEndian(data1);
+        swapEndian(data2);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i), data1);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i) + 1, data2);
+    }
+
+    if (i + sizeof(__m128i) <= bytes) {
+        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
+        swapEndian(data);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i), data);
+        i += sizeof(__m128i);
+    }
+
+    // epilogue
+    for (size_t _i = 0 ; i < bytes && _i < sizeof(__m128i); i += sizeof(quint16), _i += sizeof(quint16))
+        qbswap(qFromUnaligned<quint16>(src + i), dst + i);
+
+    // return the total, so the bswapLoop below does nothing
+    return bytes;
+}
+#else
+template <typename T> static Q_ALWAYS_INLINE
+size_t simdSwapLoop(const uchar *, size_t, uchar *) noexcept
+{
+    return 0;
+}
+#endif
+
+template <typename T> static Q_ALWAYS_INLINE
+void *bswapLoop(const uchar *src, size_t n, uchar *dst) noexcept
+{
+    // Buffers cannot partially overlap: either they're identical or totally
+    // disjoint (note: they can be adjacent).
+    if (src != dst) {
+        quintptr s = quintptr(src);
+        quintptr d = quintptr(dst);
+        if (s < d)
+            Q_ASSERT(s + n <= d);
+        else
+            Q_ASSERT(d + n <= s);
+    }
+
+    size_t i = simdSwapLoop<T>(src, n, dst);
+
+    for ( ; i < n; i += sizeof(T))
+        qbswap(qFromUnaligned<T>(src + i), dst + i);
+    return dst + i;
+}
+
+template <> void *qbswap<2>(const void *source, qsizetype n, void *dest) noexcept
+{
+    const uchar *src = reinterpret_cast<const uchar *>(source);
+    uchar *dst = reinterpret_cast<uchar *>(dest);
+
+    return bswapLoop<quint16>(src, n << 1, dst);
+}
+
+template <> void *qbswap<4>(const void *source, qsizetype n, void *dest) noexcept
+{
+    const uchar *src = reinterpret_cast<const uchar *>(source);
+    uchar *dst = reinterpret_cast<uchar *>(dest);
+
+    return bswapLoop<quint32>(src, n << 2, dst);
+}
+
+template <> void *qbswap<8>(const void *source, qsizetype n, void *dest) noexcept
+{
+    const uchar *src = reinterpret_cast<const uchar *>(source);
+    uchar *dst = reinterpret_cast<uchar *>(dest);
+
+    return bswapLoop<quint64>(src, n << 3, dst);
+}
+
+QT_END_NAMESPACE
author	Thiago Macieira <thiago.macieira@intel.com>	2018-05-25 16:33:48 -0300
committer	Thiago Macieira <thiago.macieira@intel.com>	2018-07-04 03:04:40 +0000
commit	d0427759c67704fe0f1b04edadd4d30329af268c (patch)
tree	be555a72712a68658dd907fc600f03385df8e2cf /src/corelib/global/qendian.cpp
parent	c053f9d15467459477ccbbe156a096e3ac5e3a91 (diff)