Let the compiler do the unaligned loads in QBitArray::count(bool)

For platforms where the CPU can do unaligned loads on its own, like x86, the compiler will generate actual loads. On other CPUs, it will do the byte-by-byte load as we were doing by hand. The loaded value is only passed to qPopulationCount(), so the byte order of the load does not affect the result. The compiler cannot generate worse code than our hand-rolled load, so this change can only improve performance. Change-Id: I32a89e64aa64d8af504be6c5a10b04d7573cdb98 Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2013-09-11 16:38:09 -0700
committer: The Qt Project <gerrit-noreply@qt-project.org> 2013-09-14 20:33:29 +0200
commit: 6f0fdaa76ca44e2e2a4f1ff4310a22493c93ea23 (patch)
tree: e1711e42bc076e9cda325d3ae2eb12e8981194b4 /src/corelib/tools/qbitarray.cpp
parent: 6c3a9df3fee0dc387bde3e16d185952ade4ea2fd (diff)
1 files changed, 21 insertions, 2 deletions
diff --git a/src/corelib/tools/qbitarray.cpp b/src/corelib/tools/qbitarray.cpp
index 42e29f641c..169f0ce2c8 100644
--- a/src/corelib/tools/qbitarray.cpp
+++ b/src/corelib/tools/qbitarray.cpp
@@ -161,6 +161,25 @@ QBitArray::QBitArray(int size, bool value)
     Same as size().
 */
 
+template <typename T> T qUnalignedLoad(const uchar *ptr)
+{
+    /*
+     * Loads a T from the sizeof(T) bytes at ptr, which need not be aligned.
+     * Testing with different compilers shows that they all optimize the memcpy
+     * call away and replace it with direct loads whenever possible. On x86 and
+     * PPC, GCC does direct unaligned loads; on MIPS, it generates a pair of
+     * load-left and load-right instructions. ICC and Clang do the same on x86.
+     * This holds for both 32- and 64-bit targets.
+     * On ARM cores without unaligned loads, the compiler leaves a call to
+     * memcpy.
+     */
+
+    T u;
+    memcpy(&u, ptr, sizeof(u));
+    return u;
+}
+
+
 /*!
     If \a on is true, this function returns the number of
     1-bits stored in the bit array; otherwise the number
@@ -176,12 +195,12 @@ int QBitArray::count(bool on) const
     const quint8 *const end = reinterpret_cast<const quint8 *>(d.end());
 
     while (bits + 3 <= end) {
-        quint32 v = quint32(bits[0]) | (quint32(bits[1]) << 8) | (quint32(bits[2]) << 16) | (quint32(bits[3]) << 24);
+        quint32 v = qUnalignedLoad<quint32>(bits);
         bits += 4;
         numBits += int(qPopulationCount(v));
     }
     if (bits + 1 < end) {
-        quint16 v = quint16(bits[0]) | (quint16(bits[1]) << 8);
+        quint16 v = qUnalignedLoad<quint16>(bits);
         bits += 2;
         numBits += int(qPopulationCount(v));
     }
author	Thiago Macieira <thiago.macieira@intel.com>	2013-09-11 16:38:09 -0700
committer	The Qt Project <gerrit-noreply@qt-project.org>	2013-09-14 20:33:29 +0200
commit	6f0fdaa76ca44e2e2a4f1ff4310a22493c93ea23 (patch)
tree	e1711e42bc076e9cda325d3ae2eb12e8981194b4 /src/corelib/tools/qbitarray.cpp
parent	6c3a9df3fee0dc387bde3e16d185952ade4ea2fd (diff)