From 6f0fdaa76ca44e2e2a4f1ff4310a22493c93ea23 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 11 Sep 2013 16:38:09 -0700 Subject: Let the compiler do the unaligned loads in QBitArray::count(bool) For platforms where the CPU can do unaligned loads on its own, like x86, the compiler will generate actual loads. On other CPUs, it will do the byte-by-byte load like we were doing. The compiler cannot generate worse code than our hand-rolled load, so this change can only improve performance. Change-Id: I32a89e64aa64d8af504be6c5a10b04d7573cdb98 Reviewed-by: Olivier Goffart --- src/corelib/tools/qbitarray.cpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'src/corelib/tools/qbitarray.cpp') diff --git a/src/corelib/tools/qbitarray.cpp b/src/corelib/tools/qbitarray.cpp index 42e29f641c..169f0ce2c8 100644 --- a/src/corelib/tools/qbitarray.cpp +++ b/src/corelib/tools/qbitarray.cpp @@ -161,6 +161,25 @@ QBitArray::QBitArray(int size, bool value) Same as size(). */ +template <typename T> T qUnalignedLoad(const uchar *ptr) +{ + /* + * Testing with different compilers shows that they all optimize the memcpy + * call away and replace with direct loads whenever possible. On x86 and PPC, + * GCC does direct unaligned loads; on MIPS, it generates a pair of load-left + * and load-right instructions. ICC and Clang do the same on x86. This is both + * 32- and 64-bit. + * + * On ARM cores without unaligned loads, the compiler leaves a call to + * memcpy. + */ + + T u; + memcpy(&u, ptr, sizeof(u)); + return u; +} + + /*! 
If \a on is true, this function returns the number of 1-bits stored in the bit array; otherwise the number @@ -176,12 +195,12 @@ int QBitArray::count(bool on) const const quint8 *const end = reinterpret_cast<const quint8 *>(d.end()); while (bits + 3 <= end) { - quint32 v = quint32(bits[0]) | (quint32(bits[1]) << 8) | (quint32(bits[2]) << 16) | (quint32(bits[3]) << 24); + quint32 v = qUnalignedLoad<quint32>(bits); bits += 4; numBits += int(qPopulationCount(v)); } if (bits + 1 < end) { - quint16 v = quint16(bits[0]) | (quint16(bits[1]) << 8); + quint16 v = qUnalignedLoad<quint16>(bits); bits += 2; numBits += int(qPopulationCount(v)); } -- cgit v1.2.3