diff options
Diffstat (limited to 'tests/benchmarks/corelib/tools/qstring/main.cpp')
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 2601 |
1 files changed, 2601 insertions, 0 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp new file mode 100644 index 0000000000..96f2c30cf4 --- /dev/null +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -0,0 +1,2601 @@ +/**************************************************************************** +** +** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the test suite of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. 
+** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ +#include <QStringList> +#include <QFile> +#include <QtTest/QtTest> + +#ifdef Q_OS_SYMBIAN +// In Symbian OS test data is located in applications private dir +// Application private dir is default serach path for files, so SRCDIR can be set to empty +#define SRCDIR "" +#endif + +#ifdef Q_OS_UNIX +#include <sys/mman.h> +#include <unistd.h> +#endif + +#include <private/qsimd_p.h> + +#include "data.h" + +class tst_QString: public QObject +{ + Q_OBJECT +public: + tst_QString(); +private slots: + void equals() const; + void equals_data() const; + void equals2_data() const; + void equals2() const; + void ucstrncmp_data() const; + void ucstrncmp() const; + void fromUtf8() const; + void fromLatin1_data() const; + void fromLatin1() const; + void fromLatin1Alternatives_data() const; + void fromLatin1Alternatives() const; + void fromUtf8Alternatives_data() const; + void fromUtf8Alternatives() const; +}; + +void tst_QString::equals() const +{ + QFETCH(QString, a); + QFETCH(QString, b); + + QBENCHMARK { + a == b; + } +} + +tst_QString::tst_QString() +{ +} + +void tst_QString::equals_data() const +{ + static const struct { + ushort data[80]; + int dummy; // just to ensure 4-byte alignment + } data = { + { + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 16 + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 32 + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 48 + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 64 + 64, 64, 64, 64, 96, 96, 96, 96, + 64, 64, 96, 96, 96, 96, 96, 96 // 80 + }, 0 + }; + const QChar *ptr = reinterpret_cast<const QChar *>(data.data); + + QTest::addColumn<QString>("a"); + QTest::addColumn<QString>("b"); + QString base = QString::fromRawData(ptr, 64); + + QTest::newRow("different-length") << base << QString::fromRawData(ptr, 4); + 
QTest::newRow("same-string") << base << base; + QTest::newRow("same-data") << base << QString::fromRawData(ptr, 64); + + // try to avoid crossing a cache line (that is, at ptr[64]) + QTest::newRow("aligned-aligned-4n") + << QString::fromRawData(ptr, 60) << QString::fromRawData(ptr + 2, 60); + QTest::newRow("aligned-unaligned-4n") + << QString::fromRawData(ptr, 60) << QString::fromRawData(ptr + 1, 60); + QTest::newRow("unaligned-unaligned-4n") + << QString::fromRawData(ptr + 1, 60) << QString::fromRawData(ptr + 3, 60); + + QTest::newRow("aligned-aligned-4n+1") + << QString::fromRawData(ptr, 61) << QString::fromRawData(ptr + 2, 61); + QTest::newRow("aligned-unaligned-4n+1") + << QString::fromRawData(ptr, 61) << QString::fromRawData(ptr + 1, 61); + QTest::newRow("unaligned-unaligned-4n+1") + << QString::fromRawData(ptr + 1, 61) << QString::fromRawData(ptr + 3, 61); + + QTest::newRow("aligned-aligned-4n-1") + << QString::fromRawData(ptr, 59) << QString::fromRawData(ptr + 2, 59); + QTest::newRow("aligned-unaligned-4n-1") + << QString::fromRawData(ptr, 59) << QString::fromRawData(ptr + 1, 59); + QTest::newRow("unaligned-unaligned-4n-1") + << QString::fromRawData(ptr + 1, 59) << QString::fromRawData(ptr + 3, 59); + + QTest::newRow("aligned-aligned-2n") + << QString::fromRawData(ptr, 58) << QString::fromRawData(ptr + 2, 58); + QTest::newRow("aligned-unaligned-2n") + << QString::fromRawData(ptr, 58) << QString::fromRawData(ptr + 1, 58); + QTest::newRow("unaligned-unaligned-2n") + << QString::fromRawData(ptr + 1, 58) << QString::fromRawData(ptr + 3, 58); +} + +static bool equals2_memcmp_call(const ushort *p1, const ushort *p2, int len) +{ + return memcmp(p1, p2, len * 2) == 0; +} + +static bool equals2_bytewise(const ushort *p1, const ushort *p2, int len) +{ + if (p1 == p2 || !len) + return true; + uchar *b1 = (uchar *)p1; + uchar *b2 = (uchar *)p2; + len *= 2; + while (len--) + if (*b1++ != *b2++) + return false; + return true; +} + +static bool equals2_shortwise(const 
ushort *p1, const ushort *p2, int len) +{ + if (p1 == p2 || !len) + return true; +// for (register int counter; counter < len; ++counter) +// if (p1[counter] != p2[counter]) +// return false; + while (len--) { + if (p1[len] != p2[len]) + return false; + } + return true; +} + +static bool equals2_intwise(const ushort *p1, const ushort *p2, int length) +{ + if (p1 == p2 || !length) + return true; + register union { + const quint16 *w; + const quint32 *d; + quintptr value; + } sa, sb; + sa.w = p1; + sb.w = p2; + + // check alignment + if ((sa.value & 2) == (sb.value & 2)) { + // both addresses have the same alignment + if (sa.value & 2) { + // both addresses are not aligned to 4-bytes boundaries + // compare the first character + if (*sa.w != *sb.w) + return false; + --length; + ++sa.w; + ++sb.w; + + // now both addresses are 4-bytes aligned + } + + // both addresses are 4-bytes aligned + // do a fast 32-bit comparison + register const quint32 *e = sa.d + (length >> 1); + for ( ; sa.d != e; ++sa.d, ++sb.d) { + if (*sa.d != *sb.d) + return false; + } + + // do we have a tail? + return (length & 1) ? 
*sa.w == *sb.w : true; + } else { + // one of the addresses isn't 4-byte aligned but the other is + register const quint16 *e = sa.w + length; + for ( ; sa.w != e; ++sa.w, ++sb.w) { + if (*sa.w != *sb.w) + return false; + } + } + return true; +} + +static inline bool equals2_short_tail(const ushort *p1, const ushort *p2, int len) +{ + if (len) { + if (*p1 != *p2) + return false; + if (--len) { + if (p1[1] != p2[1]) + return false; + if (--len) { + if (p1[2] != p2[2]) + return false; + if (--len) { + if (p1[3] != p2[3]) + return false; + if (--len) { + if (p1[4] != p2[4]) + return false; + if (--len) { + if (p1[5] != p2[5]) + return false; + if (--len) { + if (p1[6] != p2[6]) + return false; + return p1[7] == p2[7]; + } + } + } + } + } + } + } + return true; +} + +//#pragma GCC optimize("no-unroll-loops") +#ifdef __SSE2__ +static bool equals2_sse2_aligned(const ushort *p1, const ushort *p2, int len) +{ + if (len >= 8) { + qptrdiff counter = 0; + while (len > 8) { + __m128i q1 = _mm_load_si128((__m128i *)(p1 + counter)); + __m128i q2 = _mm_load_si128((__m128i *)(p2 + counter)); + __m128i cmp = _mm_cmpeq_epi16(q1, q2); + if (ushort(_mm_movemask_epi8(cmp)) != ushort(0xffff)) + return false; + + len -= 8; + counter += 8; + } + p1 += counter; + p2 += counter; + } + + return equals2_short_tail(p1, p2, len); +} + +static bool equals2_sse2(const ushort *p1, const ushort *p2, int len) +{ + if (p1 == p2 || !len) + return true; + + if (len >= 8) { + qptrdiff counter = 0; + while (len >= 8) { + __m128i q1 = _mm_loadu_si128((__m128i *)(p1 + counter)); + __m128i q2 = _mm_loadu_si128((__m128i *)(p2 + counter)); + __m128i cmp = _mm_cmpeq_epi16(q1, q2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + counter += 8; + } + p1 += counter; + p2 += counter; + } + + return equals2_short_tail(p1, p2, len); +} + +//static bool equals2_sse2(const ushort *p1, const ushort *p2, int len) +//{ +// register int val1 = quintptr(p1) & 0xf; +// register int val2 = 
quintptr(p2) & 0xf; +// if (false && val1 + val2 == 0) +// return equals2_sse2_aligned(p1, p2, len); +// else +// return equals2_sse2_unaligned(p1, p2, len); +//} + +static bool equals2_sse2_aligning(const ushort *p1, const ushort *p2, int len) +{ + if (len < 8) + return equals2_short_tail(p1, p2, len); + + qptrdiff counter = 0; + + // which one is easier to align, p1 or p2 ? + register int val1 = quintptr(p1) & 0xf; + register int val2 = quintptr(p2) & 0xf; + if (val1 && val2) { +#if 0 + // we'll align the one which requires the least number of steps + if (val1 > val2) { + qSwap(p1, p2); + val1 = val2; + } + + // val1 contains the number of bytes past the 16-aligned mark + // we must read 16-val1 bytes to align + val1 = 16 - val1; + if (val1 & 0x2) { + if (*p1 != *p2) + return false; + --len; + ++counter; + } + while (val1 & 12) { + if (*(uint*)p1 != *(uint*)p2) + return false; + --len; + counter += 2; + val1 -= 4; + } +#else + // we'll align the one closest to the 16-byte mark + if (val1 > val2) { + qSwap(p1, p2); + val1 = val2; + } + + // we're reading val1 bytes too many + __m128i q2 = _mm_loadu_si128((__m128i *)(p2 - val1/2)); + __m128i cmp = _mm_cmpeq_epi16(*(__m128i *)(p1 - val1/2), q2); + if (short(_mm_movemask_epi8(cmp)) >> val1 != short(-1)) + return false; + + counter = 8 - val1/2; + len -= 8 - val1/2; +#endif + } else if (!val2) { + // p2 is already aligned + qSwap(p1, p2); + } + + // p1 is aligned + + while (len >= 8) { + __m128i q1 = _mm_load_si128((__m128i *)(p1 + counter)); + __m128i q2 = _mm_loadu_si128((__m128i *)(p2 + counter)); + __m128i cmp = _mm_cmpeq_epi16(q1, q2); + if (ushort(_mm_movemask_epi8(cmp)) != ushort(0xffff)) + return false; + + len -= 8; + counter += 8; + } + + // tail + return equals2_short_tail(p1 + counter, p2 + counter, len); +} + +#ifdef __SSE3__ +static bool equals2_sse3(const ushort *p1, const ushort *p2, int len) +{ + if (p1 == p2 || !len) + return true; + + if (len >= 8) { + qptrdiff counter = 0; + while (len >= 8) { + 
__m128i q1 = _mm_lddqu_si128((__m128i *)(p1 + counter)); + __m128i q2 = _mm_lddqu_si128((__m128i *)(p2 + counter)); + __m128i cmp = _mm_cmpeq_epi16(q1, q2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + counter += 8; + } + p1 += counter; + p2 += counter; + } + + return equals2_short_tail(p1, p2, len); +} + +#ifdef __SSSE3__ +template<int N> static inline bool equals2_ssse3_alignr(__m128i *m1, __m128i *m2, int len) +{ + __m128i lower = _mm_load_si128(m1); + while (len >= 8) { + __m128i upper = _mm_load_si128(m1 + 1); + __m128i correct; + correct = _mm_alignr_epi8(upper, lower, N); + + __m128i q2 = _mm_lddqu_si128(m2); + __m128i cmp = _mm_cmpeq_epi16(correct, q2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + ++m2; + ++m1; + lower = upper; + } + + // tail + return len == 0 || equals2_short_tail((const ushort *)m1 + N / 2, (const ushort*)m2, len); +} + +static inline bool equals2_ssse3_aligned(__m128i *m1, __m128i *m2, int len) +{ + while (len >= 8) { + __m128i q2 = _mm_lddqu_si128(m2); + __m128i cmp = _mm_cmpeq_epi16(*m1, q2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + ++m1; + ++m2; + } + return len == 0 || equals2_short_tail((const ushort *)m1, (const ushort *)m2, len); +} + +static bool equals2_ssse3(const ushort *p1, const ushort *p2, int len) +{ + // p1 & 0xf can be: + // 0, 2, 4, 6, 8, 10, 12, 14 + // If it's 0, we're aligned + // If it's not, then we're interested in the 16 - (p1 & 0xf) bytes only + + if (len >= 8) { + // find the last aligned position below the p1 memory + __m128i *m1 = (__m128i *)(quintptr(p1) & ~0xf); + __m128i *m2 = (__m128i *)p2; + qptrdiff diff = quintptr(p1) - quintptr(m1); + + // diff contains the number of extra bytes + if (diff == 10) + return equals2_ssse3_alignr<10>(m1, m2, len); + else if (diff == 2) + return equals2_ssse3_alignr<2>(m1, m2, len); + if (diff < 8) { + if (diff < 4) { + return equals2_ssse3_aligned(m1, m2, len); + 
} else { + if (diff == 4) + return equals2_ssse3_alignr<4>(m1, m2, len); + else // diff == 6 + return equals2_ssse3_alignr<6>(m1, m2, len); + } + } else { + if (diff < 12) { + return equals2_ssse3_alignr<8>(m1, m2, len); + } else { + if (diff == 12) + return equals2_ssse3_alignr<12>(m1, m2, len); + else // diff == 14 + return equals2_ssse3_alignr<14>(m1, m2, len); + } + } + } + + // tail + return equals2_short_tail(p1, p2, len); +} + +template<int N> static inline bool equals2_ssse3_aligning_alignr(__m128i *m1, __m128i *m2, int len) +{ + __m128i lower = _mm_load_si128(m1); + while (len >= 8) { + __m128i upper = _mm_load_si128(m1 + 1); + __m128i correct; + correct = _mm_alignr_epi8(upper, lower, N); + + __m128i cmp = _mm_cmpeq_epi16(correct, *m2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + ++m2; + ++m1; + lower = upper; + } + + // tail + return len == 0 || equals2_short_tail((const ushort *)m1 + N / 2, (const ushort*)m2, len); +} + +static bool equals2_ssse3_aligning(const ushort *p1, const ushort *p2, int len) +{ + if (len < 8) + return equals2_short_tail(p1, p2, len); + qptrdiff counter = 0; + + // which one is easier to align, p1 or p2 ? 
+ { + register int val1 = quintptr(p1) & 0xf; + register int val2 = quintptr(p2) & 0xf; + if (val1 && val2) { + // we'll align the one closest to the 16-byte mark + if (val1 < val2) { + qSwap(p1, p2); + val2 = val1; + } + + // we're reading val1 bytes too many + __m128i q1 = _mm_lddqu_si128((__m128i *)(p1 - val2/2)); + __m128i cmp = _mm_cmpeq_epi16(q1, *(__m128i *)(p2 - val2/2)); + if (short(_mm_movemask_epi8(cmp)) >> val1 != short(-1)) + return false; + + counter = 8 - val2/2; + len -= 8 - val2/2; + } else if (!val1) { + // p1 is already aligned + qSwap(p1, p2); + } + } + + // p2 is aligned now + // we want to use palignr in the mis-alignment of p1 + __m128i *m1 = (__m128i *)(quintptr(p1 + counter) & ~0xf); + __m128i *m2 = (__m128i *)(p2 + counter); + register int val1 = quintptr(p1 + counter) - quintptr(m1); + + // val1 contains the number of extra bytes + if (val1 == 8) + return equals2_ssse3_aligning_alignr<8>(m1, m2, len); + if (val1 == 0) + return equals2_sse2_aligned(p1 + counter, p2 + counter, len); + if (val1 < 8) { + if (val1 < 4) { + return equals2_ssse3_aligning_alignr<2>(m1, m2, len); + } else { + if (val1 == 4) + return equals2_ssse3_aligning_alignr<4>(m1, m2, len); + else // diff == 6 + return equals2_ssse3_aligning_alignr<6>(m1, m2, len); + } + } else { + if (val1 < 12) { + return equals2_ssse3_aligning_alignr<10>(m1, m2, len); + } else { + if (val1 == 12) + return equals2_ssse3_aligning_alignr<12>(m1, m2, len); + else // diff == 14 + return equals2_ssse3_aligning_alignr<14>(m1, m2, len); + } + } +} + +#ifdef __SSE4_1__ +static bool equals2_sse4(const ushort *p1, const ushort *p2, int len) +{ + // We use the pcmpestrm instruction searching for differences (negative polarity) + // it will reset CF if it's all equal + // it will reset OF if the first char is equal + // it will set ZF & SF if the length is less than 8 (which means we've done the last operation) + // the three possible conditions are: + // difference found: CF = 1 + // all equal, not 
finished: CF = ZF = SF = 0 + // all equal, finished: CF = 0, ZF = SF = 1 + // We use the JA instruction that jumps if ZF = 0 and CF = 0 + if (p1 == p2 || !len) + return true; + + // This function may read some bytes past the end of p1 or p2 + // It is safe to do that, as long as those extra bytes (beyond p1+len and p2+len) + // are on the same page as the last valid byte. + // If len is a multiple of 8, we'll never load invalid bytes. + if (len & 7) { + // The last load would load (len & ~7) valid bytes and (8 - (len & ~7)) invalid bytes. + // So we can't do the last load if any of those bytes is in a different + // page. That is, if: + // pX + len is on a different page from pX + (len & ~7) + 8 + // + // that is, if second-to-last load ended up less than 16 bytes from the page end: + // pX + (len & ~7) is the last ushort read in the second-to-last load + if (len < 8) + return equals2_short_tail(p1, p2, len); + if ((quintptr(p1 + (len & ~7)) & 0xfff) > 0xff0 || + (quintptr(p2 + (len & ~7)) & 0xfff) > 0xff0) { + + // yes, so we mustn't do the final 128-bit load + bool result; + asm ( + "sub %[p1], %[p2]\n\t" + "sub $16, %[p1]\n\t" + "add $8, %[len]\n\t" + + // main loop: + "0:\n\t" + "add $16, %[p1]\n\t" + "sub $8, %[len]\n\t" + "jz 1f\n\t" + "lddqu (%[p1]), %%xmm0\n\t" + "mov %[len], %%edx\n\t" + "pcmpestri %[mode], (%[p2],%[p1]), %%xmm0\n\t" + + "jna 1f\n\t" + "add $16, %[p1]\n\t" + "sub $8, %[len]\n\t" + "jz 1f\n\t" + "lddqu (%[p1]), %%xmm0\n\t" + "mov %[len], %%edx\n\t" + "pcmpestri %[mode], (%[p2],%[p1]), %%xmm0\n\t" + + "ja 0b\n\t" + "1:\n\t" + "setnc %[result]\n\t" + : [result] "=a" (result), + [p1] "+r" (p1), + [p2] "+r" (p2) + : [len] "0" (len & ~7), + [mode] "i" (_SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY) + : "%edx", "%ecx", "%xmm0" + ); + return result && equals2_short_tail(p1, (const ushort *)(quintptr(p1) + quintptr(p2)), len & 7); + } + } + +// const qptrdiff disp = p2 - p1; +// p1 -= 8; +// len += 8; +// while (true) { +// enum 
{ Mode = _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY }; + +// p1 += 8; +// len -= 8; +// if (!len) +// return true; + +// __m128i q1 = _mm_lddqu_si128((__m128i *)(p1 + disp)); +// __m128i *m2 = (__m128i *)p1; + +// bool cmp_a = _mm_cmpestra(q1, len, *m2, len, Mode); +// if (cmp_a) +// continue; +// return !_mm_cmpestrc(q1, len, *m2, len, Mode); +// } +// return true; + bool result; + asm ( + "sub %[p1], %[p2]\n\t" + "sub $16, %[p1]\n\t" + "add $8, %[len]\n\t" + + "0:\n\t" + "add $16, %[p1]\n\t" + "sub $8, %[len]\n\t" + "jz 1f\n\t" + "lddqu (%[p2],%[p1]), %%xmm0\n\t" + "mov %[len], %%edx\n\t" + "pcmpestri %[mode], (%[p1]), %%xmm0\n\t" + + "jna 1f\n\t" + "add $16, %[p1]\n\t" + "sub $8, %[len]\n\t" + "jz 1f\n\t" + "lddqu (%[p2],%[p1]), %%xmm0\n\t" + "mov %[len], %%edx\n\t" + "pcmpestri %[mode], (%[p1]), %%xmm0\n\t" + + "ja 0b\n\t" + + "1:\n\t" + "setnc %[result]\n\t" + : [result] "=a" (result) + : [len] "0" (len), + [p1] "r" (p1), + [p2] "r" (p2), + [mode] "i" (_SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY) + : "%edx", "%ecx", "%xmm0" + ); + return result; +} + +#endif +#endif +#endif +#endif + +typedef bool (* FuncPtr)(const ushort *, const ushort *, int); +static const FuncPtr func[] = { + equals2_memcmp_call, // 0 + equals2_bytewise, // 1 + equals2_shortwise, // 1 + equals2_intwise, // 3 +#ifdef __SSE2__ + equals2_sse2, // 4 + equals2_sse2_aligning, // 5 +#ifdef __SSE3__ + equals2_sse3, // 6 +#ifdef __SSSE3__ + equals2_ssse3, // 7 + equals2_ssse3, // 8 +#ifdef __SSE4_1__ + equals2_sse4, // 9 +#endif +#endif +#endif +#endif + 0 +}; +static const int functionCount = sizeof(func)/sizeof(func[0]) - 1; + +void tst_QString::equals2_data() const +{ + QTest::addColumn<int>("algorithm"); + QTest::newRow("selftest") << -1; + QTest::newRow("memcmp_call") << 0; + QTest::newRow("bytewise") << 1; + QTest::newRow("shortwise") << 2; + QTest::newRow("intwise") << 3; +#ifdef __SSE2__ + QTest::newRow("sse2") << 4; + 
QTest::newRow("sse2_aligning") << 5; +#ifdef __SSE3__ + QTest::newRow("sse3") << 6; +#ifdef __SSSE3__ + QTest::newRow("ssse3") << 7; + QTest::newRow("ssse3_aligning") << 8; +#ifdef __SSE4_1__ + QTest::newRow("sse4.2") << 9; +#endif +#endif +#endif +#endif +} + +static void __attribute__((noinline)) equals2_selftest() +{ +#ifdef Q_OS_UNIX + const long pagesize = sysconf(_SC_PAGESIZE); + void *page1, *page3; + ushort *page2; + page1 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + page2 = (ushort *)mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + page3 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + Q_ASSERT(quintptr(page2) == quintptr(page1) + pagesize || quintptr(page2) == quintptr(page1) - pagesize); + Q_ASSERT(quintptr(page3) == quintptr(page2) + pagesize || quintptr(page3) == quintptr(page2) - pagesize); + munmap(page1, pagesize); + munmap(page3, pagesize); + + // populate our page + for (uint i = 0; i < pagesize / sizeof(long long); ++i) + ((long long *)page2)[i] = Q_INT64_C(0x0041004100410041); + + // the following should crash: + //page2[-1] = 0xdead; + //page2[pagesize / sizeof(ushort) + 1] = 0xbeef; + + static const ushort needle[] = { + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41 + }; + + for (int algo = 0; algo < functionCount; ++algo) { + // boundary condition test: + for (int i = 0; i < 8; ++i) { + (func[algo])(page2 + i, needle, sizeof needle / 2); + (func[algo])(page2 - i - 1 - sizeof(needle)/2 + pagesize/2, needle, sizeof needle/2); + } + } + + munmap(page2, pagesize); +#endif + + for (int algo = 0; algo < functionCount; ++algo) { + for (int i = 0; i < stringCollectionCount; ++i) { + const ushort *p1 = stringCollectionData + stringCollection[i].offset1; + const ushort *p2 = stringCollectionData + stringCollection[i].offset2; + bool expected = memcmp(p1, p2, 
stringCollection[i].len * 2) == 0; + + bool result = (func[algo])(p1, p2, stringCollection[i].len); + if (expected != result) + qWarning().nospace() + << "algo=" << algo + << " i=" << i + << " failed (" << result << "!=" << expected + << "); strings were " + << QByteArray((char*)p1, stringCollection[i].len).toHex() + << " and " + << QByteArray((char*)p2, stringCollection[i].len).toHex(); + } + } +} + +void tst_QString::equals2() const +{ + QFETCH(int, algorithm); + if (algorithm == -1) { + equals2_selftest(); + return; + } + + QBENCHMARK { + for (int i = 0; i < stringCollectionCount; ++i) { + const ushort *p1 = stringCollectionData + stringCollection[i].offset1; + const ushort *p2 = stringCollectionData + stringCollection[i].offset2; + bool result = (func[algorithm])(p1, p2, stringCollection[i].len); + Q_UNUSED(result); + } + } +} + +static int ucstrncmp_shortwise(const ushort *a, const ushort *b, int l) +{ + while (l-- && *a == *b) + a++,b++; + if (l==-1) + return 0; + return *a - *b; +} + +static int ucstrncmp_intwise(const ushort *a, const ushort *b, int len) +{ + // do both strings have the same alignment? + if ((quintptr(a) & 2) == (quintptr(b) & 2)) { + // are we aligned to 4 bytes? + if (quintptr(a) & 2) { + if (*a != *b) + return *a - *b; + ++a; + ++b; + --len; + } + + const uint *p1 = (const uint *)a; + const uint *p2 = (const uint *)b; + quintptr counter = 0; + for ( ; len > 1 ; len -= 2, ++counter) { + if (p1[counter] != p2[counter]) { + // which ushort isn't equal? + int diff = a[2*counter] - b[2*counter]; + return diff ? diff : a[2*counter + 1] - b[2*counter + 1]; + } + } + + return len ? 
a[2*counter] - b[2*counter] : 0; + } else { + while (len-- && *a == *b) + a++,b++; + if (len==-1) + return 0; + return *a - *b; + } +} + +#ifdef __SSE2__ +static inline int ucstrncmp_short_tail(const ushort *p1, const ushort *p2, int len) +{ + if (len) { + if (*p1 != *p2) + return *p1 - *p2; + if (--len) { + if (p1[1] != p2[1]) + return p1[1] - p2[1]; + if (--len) { + if (p1[2] != p2[2]) + return p1[2] - p2[2]; + if (--len) { + if (p1[3] != p2[3]) + return p1[3] - p2[3]; + if (--len) { + if (p1[4] != p2[4]) + return p1[4] - p2[4]; + if (--len) { + if (p1[5] != p2[5]) + return p1[5] - p2[5]; + if (--len) { + if (p1[6] != p2[6]) + return p1[6] - p2[6]; + return p1[7] - p2[7]; + } + } + } + } + } + } + } + return 0; +} + +static inline int bsf_nonzero(register long val) +{ + int result; +# ifdef Q_CC_GNU + // returns the first non-zero bit on a non-zero reg + asm ("bsf %1, %0" : "=r" (result) : "r" (val)); + return result; +# elif defined(Q_CC_MSVC) + _BitScanForward(&result, val); + return result; +# endif +} + +static int ucstrncmp_sse2(const ushort *a, const ushort *b, int len) +{ + qptrdiff counter = 0; + while (len >= 8) { + __m128i m1 = _mm_loadu_si128((__m128i *)(a + counter)); + __m128i m2 = _mm_loadu_si128((__m128i *)(b + counter)); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + counter += bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + counter += 8; + len -= 8; + } + return ucstrncmp_short_tail(a + counter, b + counter, len); +} + +static int ucstrncmp_sse2_aligning(const ushort *a, const ushort *b, int len) +{ + if (len >= 8) { + __m128i m1 = _mm_loadu_si128((__m128i *)a); + __m128i m2 = _mm_loadu_si128((__m128i *)b); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? 
+ int counter = bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + + // now align to do 16-byte loads + int diff = 8 - (quintptr(a) & 0xf)/2; + len -= diff; + a += diff; + b += diff; + } + + qptrdiff counter = 0; + while (len >= 8) { + __m128i m1 = _mm_load_si128((__m128i *)(a + counter)); + __m128i m2 = _mm_loadu_si128((__m128i *)(b + counter)); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + counter += bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + counter += 8; + len -= 8; + } + return ucstrncmp_short_tail(a + counter, b + counter, len); +} + +static inline int ucstrncmp_sse2_aligned(const ushort *a, const ushort *b, int len) +{ + quintptr counter = 0; + while (len >= 8) { + __m128i m1 = _mm_load_si128((__m128i *)(a + counter)); + __m128i m2 = _mm_load_si128((__m128i *)(b + counter)); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + counter += bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + counter += 8; + len -= 8; + } + return ucstrncmp_short_tail(a + counter, b + counter, len); +} + +#ifdef __SSSE3__ +static inline int ucstrncmp_ssse3_alignr_aligned(const ushort *a, const ushort *b, int len) +{ + quintptr counter = 0; + while (len >= 8) { + __m128i m1 = _mm_load_si128((__m128i *)(a + counter)); + __m128i m2 = _mm_lddqu_si128((__m128i *)(b + counter)); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? 
+ counter += bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + counter += 8; + len -= 8; + } + return ucstrncmp_short_tail(a + counter, b + counter, len); +} + + +typedef __m128i (* MMLoadFunction)(const __m128i *); +template<int N, MMLoadFunction LoadFunction> +static inline int ucstrncmp_ssse3_alignr(const ushort *a, const ushort *b, int len) +{ + qptrdiff counter = 0; + __m128i lower, upper; + upper = _mm_load_si128((__m128i *)a); + + do { + lower = upper; + upper = _mm_load_si128((__m128i *)(a + counter) + 1); + __m128i merged = _mm_alignr_epi8(upper, lower, N); + + __m128i m2 = LoadFunction((__m128i *)(b + counter)); + __m128i cmp = _mm_cmpeq_epi16(merged, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + counter += bsf_nonzero(mask)/2; + return a[counter + N/2] - b[counter]; + } + + counter += 8; + len -= 8; + } while (len >= 8); + + return ucstrncmp_short_tail(a + counter + N/2, b + counter, len); +} + +static int ucstrncmp_ssse3(const ushort *a, const ushort *b, int len) +{ + if (len >= 8) { + int val = quintptr(a) & 0xf; + a -= val/2; + + if (val == 10) + return ucstrncmp_ssse3_alignr<10, _mm_lddqu_si128>(a, b, len); + else if (val == 2) + return ucstrncmp_ssse3_alignr<2, _mm_lddqu_si128>(a, b, len); + if (val < 8) { + if (val < 4) + return ucstrncmp_ssse3_alignr_aligned(a, b, len); + else if (val == 4) + return ucstrncmp_ssse3_alignr<4, _mm_lddqu_si128>(a, b, len); + else + return ucstrncmp_ssse3_alignr<6, _mm_lddqu_si128>(a, b, len); + } else { + if (val < 12) + return ucstrncmp_ssse3_alignr<8, _mm_lddqu_si128>(a, b, len); + else if (val == 12) + return ucstrncmp_ssse3_alignr<12, _mm_lddqu_si128>(a, b, len); + else + return ucstrncmp_ssse3_alignr<14, _mm_lddqu_si128>(a, b, len); + } + } + return ucstrncmp_short_tail(a, b, len); +} + +static int ucstrncmp_ssse3_aligning(const ushort *a, const ushort *b, int len) +{ + if (len >= 8) { + __m128i m1 = _mm_loadu_si128((__m128i *)a); + __m128i m2 = 
_mm_loadu_si128((__m128i *)b); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + int counter = bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + + // now 'b' align to do 16-byte loads + int diff = 8 - (quintptr(b) & 0xf)/2; + len -= diff; + a += diff; + b += diff; + } + + if (len < 8) + return ucstrncmp_short_tail(a, b, len); + + // 'b' is aligned + int val = quintptr(a) & 0xf; + a -= val/2; + + if (val == 8) + return ucstrncmp_ssse3_alignr<8, _mm_load_si128>(a, b, len); + else if (val == 0) + return ucstrncmp_sse2_aligned(a, b, len); + if (val < 8) { + if (val < 4) + return ucstrncmp_ssse3_alignr<2, _mm_load_si128>(a, b, len); + else if (val == 4) + return ucstrncmp_ssse3_alignr<4, _mm_load_si128>(a, b, len); + else + return ucstrncmp_ssse3_alignr<6, _mm_load_si128>(a, b, len); + } else { + if (val < 12) + return ucstrncmp_ssse3_alignr<10, _mm_load_si128>(a, b, len); + else if (val == 12) + return ucstrncmp_ssse3_alignr<12, _mm_load_si128>(a, b, len); + else + return ucstrncmp_ssse3_alignr<14, _mm_load_si128>(a, b, len); + } +} + +static inline +int ucstrncmp_ssse3_aligning2_aligned(const ushort *a, const ushort *b, int len, int garbage) +{ + // len >= 8 + __m128i m1 = _mm_load_si128((const __m128i *)a); + __m128i m2 = _mm_load_si128((const __m128i *)b); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + int mask = short(_mm_movemask_epi8(cmp)); // force sign extension + mask >>= garbage; + if (~mask) { + // which ushort isn't equal? 
+ uint counter = (garbage + bsf_nonzero(~mask)); + return a[counter/2] - b[counter/2]; + } + + // the first 16-garbage bytes (8-garbage/2 ushorts) were equal + len -= 8 - garbage/2; + return ucstrncmp_sse2_aligned(a + 8, b + 8, len); +} + +template<int N> static inline +int ucstrncmp_ssse3_aligning2_alignr(const ushort *a, const ushort *b, int len, int garbage) +{ + // len >= 8 + __m128i lower, upper, merged; + lower = _mm_load_si128((const __m128i*)a); + upper = _mm_load_si128((const __m128i*)(a + 8)); + merged = _mm_alignr_epi8(upper, lower, N); + + __m128i m2 = _mm_load_si128((const __m128i*)b); + __m128i cmp = _mm_cmpeq_epi16(merged, m2); + int mask = short(_mm_movemask_epi8(cmp)); // force sign extension + mask >>= garbage; + if (~mask) { + // which ushort isn't equal? + uint counter = (garbage + bsf_nonzero(~mask)); + return a[counter/2 + N/2] - b[counter/2]; + } + + // the first 16-garbage bytes (8-garbage/2 ushorts) were equal + quintptr counter = 8; + len -= 8 - garbage/2; + while (len >= 8) { + lower = upper; + upper = _mm_load_si128((__m128i *)(a + counter) + 1); + merged = _mm_alignr_epi8(upper, lower, N); + + m2 = _mm_load_si128((__m128i *)(b + counter)); + cmp = _mm_cmpeq_epi16(merged, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + counter += bsf_nonzero(mask)/2; + return a[counter + N/2] - b[counter]; + } + + counter += 8; + len -= 8; + } + + return ucstrncmp_short_tail(a + counter + N/2, b + counter, len); +} + +static inline int conditional_invert(int result, bool invert) +{ + if (invert) + return -result; + return result; +} + +static int ucstrncmp_ssse3_aligning2(const ushort *a, const ushort *b, int len) +{ + // Different strategy from above: instead of doing two unaligned loads + // when trying to align, we'll only do aligned loads and round down the + // addresses of a and b. 
This means the first load will contain garbage + // in the beginning of the string, which we'll shift out of the way + // (after _mm_movemask_epi8) + + if (len < 8) + return ucstrncmp_intwise(a, b, len); + + // both a and b are misaligned + // we'll call the alignr function with the alignment *difference* between the two + int offset = (quintptr(a) & 0xf) - (quintptr(b) & 0xf); + if (offset >= 0) { + // from this point on, b has the shortest alignment + // and align(a) = align(b) + offset + // round down the alignment so align(b) == align(a) == 0 + int garbage = (quintptr(b) & 0xf); + a = (const ushort*)(quintptr(a) & ~0xf); + b = (const ushort*)(quintptr(b) & ~0xf); + + // now the first load of b will load 'garbage' extra bytes + // and the first load of a will load 'garbage + offset' extra bytes + if (offset == 8) + return ucstrncmp_ssse3_aligning2_alignr<8>(a, b, len, garbage); + if (offset == 0) + return ucstrncmp_ssse3_aligning2_aligned(a, b, len, garbage); + if (offset < 8) { + if (offset < 4) + return ucstrncmp_ssse3_aligning2_alignr<2>(a, b, len, garbage); + else if (offset == 4) + return ucstrncmp_ssse3_aligning2_alignr<4>(a, b, len, garbage); + else + return ucstrncmp_ssse3_aligning2_alignr<6>(a, b, len, garbage); + } else { + if (offset < 12) + return ucstrncmp_ssse3_aligning2_alignr<10>(a, b, len, garbage); + else if (offset == 12) + return ucstrncmp_ssse3_aligning2_alignr<12>(a, b, len, garbage); + else + return ucstrncmp_ssse3_aligning2_alignr<14>(a, b, len, garbage); + } + } else { + // same as above but inverted + int garbage = (quintptr(a) & 0xf); + a = (const ushort*)(quintptr(a) & ~0xf); + b = (const ushort*)(quintptr(b) & ~0xf); + + offset = -offset; + if (offset == 8) + return -ucstrncmp_ssse3_aligning2_alignr<8>(b, a, len, garbage); + if (offset < 8) { + if (offset < 4) + return -ucstrncmp_ssse3_aligning2_alignr<2>(b, a, len, garbage); + else if (offset == 4) + return -ucstrncmp_ssse3_aligning2_alignr<4>(b, a, len, garbage); + else + return 
-ucstrncmp_ssse3_aligning2_alignr<6>(b, a, len, garbage); + } else { + if (offset < 12) + return -ucstrncmp_ssse3_aligning2_alignr<10>(b, a, len, garbage); + else if (offset == 12) + return -ucstrncmp_ssse3_aligning2_alignr<12>(b, a, len, garbage); + else + return -ucstrncmp_ssse3_aligning2_alignr<14>(b, a, len, garbage); + } + } +} + +#endif +#endif + +typedef int (* UcstrncmpFunction)(const ushort *, const ushort *, int); +Q_DECLARE_METATYPE(UcstrncmpFunction) + +void tst_QString::ucstrncmp_data() const +{ + QTest::addColumn<UcstrncmpFunction>("function"); + QTest::newRow("selftest") << UcstrncmpFunction(0); + QTest::newRow("shortwise") << &ucstrncmp_shortwise; + QTest::newRow("intwise") << &ucstrncmp_intwise; +#ifdef __SSE2__ + QTest::newRow("sse2") << &ucstrncmp_sse2; + QTest::newRow("sse2_aligning") << &ucstrncmp_sse2_aligning; +#ifdef __SSSE3__ + QTest::newRow("ssse3") << &ucstrncmp_ssse3; + QTest::newRow("ssse3_aligning") << &ucstrncmp_ssse3_aligning; + QTest::newRow("ssse3_aligning2") << &ucstrncmp_ssse3_aligning2; +#endif +#endif +} + +void tst_QString::ucstrncmp() const +{ + QFETCH(UcstrncmpFunction, function); + if (!function) { + static const UcstrncmpFunction func[] = { + &ucstrncmp_shortwise, + &ucstrncmp_intwise, +#ifdef __SSE2__ + &ucstrncmp_sse2, + &ucstrncmp_sse2_aligning, +#ifdef __SSSE3__ + &ucstrncmp_ssse3, + &ucstrncmp_ssse3_aligning, + &ucstrncmp_ssse3_aligning2 +#endif +#endif + }; + static const int functionCount = sizeof func / sizeof func[0]; + +#ifdef Q_OS_UNIX + const long pagesize = sysconf(_SC_PAGESIZE); + void *page1, *page3; + ushort *page2; + page1 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + page2 = (ushort *)mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + page3 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + Q_ASSERT(quintptr(page2) == quintptr(page1) + pagesize || quintptr(page2) == quintptr(page1) - 
pagesize); + Q_ASSERT(quintptr(page3) == quintptr(page2) + pagesize || quintptr(page3) == quintptr(page2) - pagesize); + munmap(page1, pagesize); + munmap(page3, pagesize); + + // populate our page + for (uint i = 0; i < pagesize / sizeof(long long); ++i) + ((long long *)page2)[i] = Q_INT64_C(0x0041004100410041); + + // the following should crash: + //page2[-1] = 0xdead; + //page2[pagesize / sizeof(ushort) + 1] = 0xbeef; + + static const ushort needle[] = { + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41 + }; + + for (int algo = 0; algo < functionCount; ++algo) { + // boundary condition test: + for (int i = 0; i < 8; ++i) { + (func[algo])(page2 + i, needle, sizeof needle / 2); + (func[algo])(page2 - i - 1 - sizeof(needle)/2 + pagesize/2, needle, sizeof needle/2); + } + } + + munmap(page2, pagesize); +#endif + + for (int algo = 0; algo < functionCount; ++algo) { + for (int i = 0; i < stringCollectionCount; ++i) { + const ushort *p1 = stringCollectionData + stringCollection[i].offset1; + const ushort *p2 = stringCollectionData + stringCollection[i].offset2; + int expected = ucstrncmp_shortwise(p1, p2, stringCollection[i].len); + expected = qBound(-1, expected, 1); + + int result = (func[algo])(p1, p2, stringCollection[i].len); + result = qBound(-1, result, 1); + if (expected != result) + qWarning().nospace() + << "algo=" << algo + << " i=" << i + << " failed (" << result << "!=" << expected + << "); strings were " + << QByteArray((char*)p1, stringCollection[i].len).toHex() + << " and " + << QByteArray((char*)p2, stringCollection[i].len).toHex(); + } + } + return; + } + + QBENCHMARK { + for (int i = 0; i < stringCollectionCount; ++i) { + const ushort *p1 = stringCollectionData + stringCollection[i].offset1; + const ushort *p2 = stringCollectionData + stringCollection[i].offset2; + (function)(p1, p2, stringCollection[i].len); + } + } +} + +void tst_QString::fromUtf8() const +{ + QFile file(SRCDIR "utf-8.txt"); 
+ if (!file.open(QFile::ReadOnly)) { + qFatal("Cannot open input file"); + return; + } + QByteArray data = file.readAll(); + const char *d = data.constData(); + int size = data.size(); + + QBENCHMARK { + QString::fromUtf8(d, size); + } +} + +void tst_QString::fromLatin1_data() const +{ + QTest::addColumn<QByteArray>("latin1"); + + // make all the strings have the same length + QTest::newRow("ascii-only") << QByteArray("HelloWorld"); + QTest::newRow("ascii+control") << QByteArray("Hello\1\r\n\x7f\t"); + QTest::newRow("ascii+nul") << QByteArray("a\0zbc\0defg", 10); + QTest::newRow("non-ascii") << QByteArray("\x80\xc0\xff\x81\xc1\xfe\x90\xd0\xef\xa0"); +} + +void tst_QString::fromLatin1() const +{ + QFETCH(QByteArray, latin1); + + while (latin1.length() < 128) { + latin1 += latin1; + } + + QByteArray copy1 = latin1, copy2 = latin1, copy3 = latin1; + copy1.chop(1); + copy2.detach(); + copy3 += latin1; // longer length + copy2.clear(); + + QBENCHMARK { + QString s1 = QString::fromLatin1(latin1); + QString s2 = QString::fromLatin1(latin1); + QString s3 = QString::fromLatin1(copy1); + QString s4 = QString::fromLatin1(copy3); + s3 = QString::fromLatin1(copy3); + } +} + +typedef void (* FromLatin1Function)(ushort *, const char *, int); +Q_DECLARE_METATYPE(FromLatin1Function) + +void fromLatin1_regular(ushort *dst, const char *str, int size) +{ + // from qstring.cpp: + while (size--) + *dst++ = (uchar)*str++; +} + +#ifdef __SSE2__ +void fromLatin1_sse2_qt47(ushort *dst, const char *str, int size) +{ + if (size >= 16) { + int chunkCount = size >> 4; // divided by 16 + const __m128i nullMask = _mm_set1_epi32(0); + for (int i = 0; i < chunkCount; ++i) { + const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load + str += 16; + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, firstHalf); // store + dst += 8; + + // unpack the last 8 bytes, padding with zeros + const __m128i 
secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, secondHalf); // store + dst += 8; + } + size = size % 16; + } + while (size--) + *dst++ = (uchar)*str++; +} + +static inline void fromLatin1_epilog(ushort *dst, const char *str, int size) +{ + if (!size) return; + dst[0] = (uchar)str[0]; + if (!--size) return; + dst[1] = (uchar)str[1]; + if (!--size) return; + dst[2] = (uchar)str[2]; + if (!--size) return; + dst[3] = (uchar)str[3]; + if (!--size) return; + dst[4] = (uchar)str[4]; + if (!--size) return; + dst[5] = (uchar)str[5]; + if (!--size) return; + dst[6] = (uchar)str[6]; + if (!--size) return; + dst[7] = (uchar)str[7]; + if (!--size) return; + dst[8] = (uchar)str[8]; + if (!--size) return; + dst[9] = (uchar)str[9]; + if (!--size) return; + dst[10] = (uchar)str[10]; + if (!--size) return; + dst[11] = (uchar)str[11]; + if (!--size) return; + dst[12] = (uchar)str[12]; + if (!--size) return; + dst[13] = (uchar)str[13]; + if (!--size) return; + dst[14] = (uchar)str[14]; + if (!--size) return; + dst[15] = (uchar)str[15]; +} + +void fromLatin1_sse2_improved(ushort *dst, const char *str, int size) +{ + const __m128i nullMask = _mm_set1_epi32(0); + qptrdiff counter = 0; + size -= 16; + while (size >= counter) { + const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store + + counter += 16; + } + size += 16; + fromLatin1_epilog(dst + counter, str + counter, size - counter); +} + +void fromLatin1_sse2_improved2(ushort *dst, const char *str, int size) +{ + const __m128i nullMask = _mm_set1_epi32(0); + qptrdiff counter = 0; + size -= 32; + while (size >= 
counter) { + const __m128i chunk1 = _mm_loadu_si128((__m128i*)(str + counter)); // load + const __m128i chunk2 = _mm_loadu_si128((__m128i*)(str + counter + 16)); // load + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf1 = _mm_unpacklo_epi8(chunk1, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf1); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf1 = _mm_unpackhi_epi8(chunk1, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf1); // store + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf2 = _mm_unpacklo_epi8(chunk2, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 16), firstHalf2); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf2 = _mm_unpackhi_epi8(chunk2, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 24), secondHalf2); // store + + counter += 32; + } + size += 16; + if (size >= counter) { + const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store + + counter += 16; + } + size += 16; + fromLatin1_epilog(dst + counter, str + counter, size - counter); +} + +void fromLatin1_prolog_unrolled(ushort *dst, const char *str, int size) +{ + // QString's data pointer is most often ending in 0x2 or 0xa + // that means the two most common values for size are (8-1)=7 and (8-5)=3 + if (size == 7) + goto copy_7; + if (size == 3) + goto copy_3; + + if (size == 6) + goto copy_6; + if (size == 5) + goto copy_5; + if (size == 4) + goto copy_4; + if (size == 2) + goto copy_2; + if (size == 1) + goto copy_1; + return; 
+ +copy_7: + dst[6] = (uchar)str[6]; +copy_6: + dst[5] = (uchar)str[5]; +copy_5: + dst[4] = (uchar)str[4]; +copy_4: + dst[3] = (uchar)str[3]; +copy_3: + dst[2] = (uchar)str[2]; +copy_2: + dst[1] = (uchar)str[1]; +copy_1: + dst[0] = (uchar)str[0]; +} + +void fromLatin1_prolog_sse2_overcommit(ushort *dst, const char *str, int) +{ + // do one iteration of conversion + const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load + + // unpack only the first 8 bytes, padding with zeros + const __m128i nullMask = _mm_set1_epi32(0); + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, firstHalf); // store +} + +template<FromLatin1Function prologFunction> +void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size) +{ + // same as the improved code, but we attempt to align at the prolog + // therefore, we issue aligned stores + + if (size >= 16) { + uint misalignment = uint(quintptr(dst) & 0xf); + uint prologCount = (16 - misalignment) / 2; + + prologFunction(dst, str, prologCount); + + size -= prologCount; + dst += prologCount; + str += prologCount; + } + + const __m128i nullMask = _mm_set1_epi32(0); + qptrdiff counter = 0; + size -= 16; + while (size >= counter) { + const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_store_si128((__m128i*)(dst + counter), firstHalf); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_store_si128((__m128i*)(dst + counter + 8), secondHalf); // store + + counter += 16; + } + size += 16; + fromLatin1_epilog(dst + counter, str + counter, size - counter); +} + +#ifdef __SSE4_1__ +void fromLatin1_sse4_pmovzxbw(ushort *dst, const char *str, int size) +{ + qptrdiff counter = 0; + size -= 16; + while (size >= counter) { + __m128i chunk = _mm_loadu_si128((__m128i*)(str + 
/*
    Prolog for fromLatin1_sse2_withprolog: converts 8 Latin-1 bytes from 'str'
    into 8 ushorts at 'dst', regardless of how many were actually requested
    (the third argument is ignored, hence "overcommit").
*/
void fromLatin1_prolog_sse4_overcommit(ushort *dst, const char *str, int)
{
    // load exactly 8 bytes with no alignment requirement; dereferencing str
    // as an __m128i would be an aligned 16-byte access on an arbitrary pointer
    const __m128i bytes = _mm_loadl_epi64((const __m128i*)str); // load

    // zero-extend the 8 bytes to 8 16-bit values
    const __m128i chunk = _mm_cvtepu8_epi16(bytes);
    _mm_storeu_si128((__m128i*)dst, chunk); // store
}
quadword + const uint16x8_t expanded_low = vmovl_u8(vget_low_u8(chunk)); + vst1q_u16(dst, expanded_low); // store + dst += 8; + const uint16x8_t expanded_high = vmovl_u8(vget_high_u8(chunk)); + vst1q_u16(dst, expanded_high); // store + dst += 8; + + len -= 16; + } + + if (len >= 8) { + // load 8 bytes into one doubleword Neon register + const uint8x8_t chunk = vld1_u8((uint8_t *)str); + str += 8; + + // expand 8 bytes into 16 bytes in a quadword register + const uint16x8_t expanded = vmovl_u8(chunk); + vst1q_u16(dst, expanded); // store + dst += 8; + + len -= 8; + } + fromLatin1_epilog(dst, str, len); +} + +void fromLatin1_neon_handwritten(ushort *dst, const char *str, int len) +{ + // same as above, but handwritten Neon + while (len >= 8) { + uint16x8_t chunk; + asm ( + "vld1.8 %[chunk], [%[str]]!\n" + "vmovl.u8 %q[chunk], %[chunk]\n" + "vst1.16 %h[chunk], [%[dst]]!\n" + : [dst] "+r" (dst), + [str] "+r" (str), + [chunk] "=w" (chunk)); + len -= 8; + } + + fromLatin1_epilog(dst, str, len); +} + +void fromLatin1_neon_handwritten2(ushort *dst, const char *str, int len) +{ + // same as above, but handwritten Neon + while (len >= 16) { + uint16x8_t chunk1, chunk2; + asm ( + "vld1.8 %h[chunk1], [%[str]]!\n" + "vmovl.u8 %q[chunk2], %f[chunk1]\n" + "vmovl.u8 %q[chunk1], %e[chunk1]\n" + "vst1.16 %h[chunk1], [%[dst]]!\n" + "vst1.16 %h[chunk2], [%[dst]]!\n" + : [dst] "+r" (dst), + [str] "+r" (str), + [chunk1] "=w" (chunk1), + [chunk2] "=w" (chunk2)); + len -= 16; + } + + if (len >= 8) { + uint16x8_t chunk; + asm ( + "vld1.8 %[chunk], [%[str]]!\n" + "vmovl.u8 %q[chunk], %[chunk]\n" + "vst1.16 %h[chunk], [%[dst]]!\n" + : [dst] "+r" (dst), + [str] "+r" (str), + [chunk] "=w" (chunk)); + len -= 8; + } + + fromLatin1_epilog(dst, str, len); +} +#endif + +void tst_QString::fromLatin1Alternatives_data() const +{ + QTest::addColumn<FromLatin1Function>("function"); + QTest::newRow("empty") << FromLatin1Function(0); + QTest::newRow("regular") << &fromLatin1_regular; +#ifdef __SSE2__ + 
QTest::newRow("sse2-qt4.7") << &fromLatin1_sse2_qt47; + QTest::newRow("sse2-improved") << &fromLatin1_sse2_improved; + QTest::newRow("sse2-improved2") << &fromLatin1_sse2_improved2; + QTest::newRow("sse2-with-prolog-regular") << &fromLatin1_sse2_withprolog<&fromLatin1_regular>; + QTest::newRow("sse2-with-prolog-unrolled") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_unrolled>; + QTest::newRow("sse2-with-prolog-sse2-overcommit") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_sse2_overcommit>; +#ifdef __SSE4_1__ + QTest::newRow("sse2-with-prolog-sse4-overcommit") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_sse4_overcommit>; + QTest::newRow("sse4-pmovzxbw") << &fromLatin1_sse4_pmovzxbw; +#endif +#endif +#ifdef __ARM_NEON__ + QTest::newRow("neon-improved") << &fromLatin1_neon_improved; + QTest::newRow("neon-improved2") << &fromLatin1_neon_improved2; + QTest::newRow("neon-handwritten") << &fromLatin1_neon_handwritten; + QTest::newRow("neon-handwritten2") << &fromLatin1_neon_handwritten2; +#endif +} + +extern StringData fromLatin1Data; +static void fromLatin1Alternatives_internal(FromLatin1Function function, QString &dst, bool doVerify) +{ + struct Entry + { + int len; + int offset1, offset2; + int align1, align2; + }; + const Entry *entries = reinterpret_cast<const Entry *>(fromLatin1Data.entries); + + for (int i = 0; i < fromLatin1Data.entryCount; ++i) { + int len = entries[i].len; + const char *src = fromLatin1Data.charData + entries[i].offset1; + + if (!function) + continue; + if (!doVerify) { + (function)(&dst.data()->unicode(), src, len); + } else { + dst.fill(QChar('x'), dst.length()); + + (function)(&dst.data()->unicode() + 8, src, len); + + QString zeroes(8, QChar('x')); + QString final = dst.mid(8, len); + QCOMPARE(final, QString::fromLatin1(src, len)); + QCOMPARE(dst.left(8), zeroes); + QCOMPARE(dst.mid(len + 8, 8), zeroes); + } + } +} + +void tst_QString::fromLatin1Alternatives() const +{ + QFETCH(FromLatin1Function, function); + + QString 
/*
    Returns true if 'ucs4' is one of the Unicode "non-characters".

    Unicode has a couple of "non-characters" that one can use internally,
    but are not allowed to be used for text interchange.

    Those are the last two entries of each Unicode Plane (U+FFFE, U+FFFF,
    U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
    U+FDEF (inclusive).
*/
static inline bool isUnicodeNonCharacter(uint ucs4)
{
    // U+FDD0..U+FDEF is 32 code points; the unsigned subtraction wraps for
    // anything below U+FDD0, so a single comparison covers both bounds
    return (ucs4 & 0xfffe) == 0xfffe
            || (ucs4 - 0xfdd0U) < 32;
}
error: overlong sequence, UTF16 surrogate or non-character + *qch++ = replacement; + ++invalid; + } else { + *qch++ = uc; + } + headerdone = true; + } + } else { + // error + i = error; + *qch++ = replacement; + ++invalid; + need = 0; + headerdone = true; + } + } else { + if (ch < 128) { + *qch++ = ushort(ch); + headerdone = true; + } else if ((ch & 0xe0) == 0xc0) { + uc = ch & 0x1f; + need = 1; + error = i; + min_uc = 0x80; + headerdone = true; + } else if ((ch & 0xf0) == 0xe0) { + uc = ch & 0x0f; + need = 2; + error = i; + min_uc = 0x800; + } else if ((ch&0xf8) == 0xf0) { + uc = ch & 0x07; + need = 3; + error = i; + min_uc = 0x10000; + headerdone = true; + } else { + // error + *qch++ = replacement; + ++invalid; + headerdone = true; + } + } + } + if (!state && need > 0) { + // unterminated UTF sequence + for (int i = error; i < len; ++i) { + *qch++ = replacement; + ++invalid; + } + } + //result.truncate(qch - (ushort *)result.unicode()); + if (state) { + state->invalidChars += invalid; + state->remainingChars = need; + if (headerdone) + state->flags |= QTextCodec::IgnoreHeader; + state->state_data[0] = need ? uc : 0; + state->state_data[1] = need ? 
min_uc : 0; + } + //return result; + return qch - dst; +} + +int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len) +{ + // This is the same code as above, but for stateless UTF-8 conversion + // no other improvements + bool headerdone = false; + const ushort replacement = QChar::ReplacementCharacter; + int need = 0; + int error = -1; + uint uc = 0; + uint min_uc = 0; + + if (len > 3 + && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { + // starts with a byte order mark + chars += 3; + len -= 3; + } + + // QString result(need + len + 1, Qt::Uninitialized); // worst case + // ushort *qch = (ushort *)result.unicode(); + ushort *qch = dst; + uchar ch; + int invalid = 0; + + for (int i = 0; i < len; ++i) { + ch = chars[i]; + if (need) { + if ((ch&0xc0) == 0x80) { + uc = (uc << 6) | (ch & 0x3f); + --need; + if (!need) { + // utf-8 bom composes into 0xfeff code point + bool nonCharacter; + if (!headerdone && uc == 0xfeff) { + // don't do anything, just skip the BOM + } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) { + // surrogate pair + //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); + *qch++ = QChar::highSurrogate(uc); + *qch++ = QChar::lowSurrogate(uc); + } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { + // error: overlong sequence, UTF16 surrogate or non-character + *qch++ = replacement; + ++invalid; + } else { + *qch++ = uc; + } + headerdone = true; + } + } else { + // error + i = error; + *qch++ = replacement; + ++invalid; + need = 0; + headerdone = true; + } + } else { + if (ch < 128) { + *qch++ = ushort(ch); + headerdone = true; + } else if ((ch & 0xe0) == 0xc0) { + uc = ch & 0x1f; + need = 1; + error = i; + min_uc = 0x80; + headerdone = true; + } else if ((ch & 0xf0) == 0xe0) { + uc = ch & 0x0f; + need = 2; + error = i; + min_uc = 0x800; + } else if ((ch&0xf8) == 0xf0) { + uc = ch & 0x07; + need = 3; + error 
= i; + min_uc = 0x10000; + headerdone = true; + } else { + // error + *qch++ = replacement; + ++invalid; + headerdone = true; + } + } + } + if (need > 0) { + // unterminated UTF sequence + for (int i = error; i < len; ++i) { + *qch++ = replacement; + ++invalid; + } + } + //result.truncate(qch - (ushort *)result.unicode()); + //return result; + return qch - dst; +} + +template <bool trusted> +static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptrdiff &counter, int &len) +{ + uchar ch = chars[counter]; + + // is it a leading or a continuation one? + if (!trusted && (ch & 0xc0) == 0x80) { + // continuation character found without the leading + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + if ((ch & 0xe0) == 0xc0) { + // two-byte UTF-8 sequence + if (!trusted && counter + 1 == len) { + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + uchar ch2 = chars[counter + 1]; + if (!trusted) + if ((ch2 & 0xc0) != 0x80) { + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + ushort ucs = (ch & 0x1f); + ucs <<= 6; + ucs |= (ch2 & 0x3f); + + // dst[counter] will correspond to chars[counter..counter+1], so adjust + ++chars; + --len; + if (trusted || ucs >= 0x80) + dst[counter] = ucs; + else + dst[counter] = QChar::ReplacementCharacter; + ++counter; + return; + } + + if ((ch & 0xf0) == 0xe0) { + // three-byte UTF-8 sequence + if (!trusted && counter + 2 >= len) { + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + uchar ch2 = chars[counter + 1]; + uchar ch3 = chars[counter + 2]; + if (!trusted) + if ((ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80) { + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + ushort ucs = (ch & 0x1f) << 12 | (ch2 & 0x3f) << 6 | (ch3 & 0x3f); + + // dst[counter] will correspond to chars[counter..counter+2], so adjust + chars += 2; + len -= 2; + if (!trusted && + (ucs < 0x800 || isUnicodeNonCharacter(ucs) || (ucs >= 0xd800 && ucs <= 0xdfff))) + dst[counter] = 
QChar::ReplacementCharacter; + else + dst[counter] = ucs; + ++counter; + return; + } + + if ((ch & 0xf8) == 0xf0) { + // four-byte UTF-8 sequence + // will require an UTF-16 surrogate pair + if (!trusted && counter + 3 >= len) { + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + uchar ch2 = chars[counter + 1]; + uchar ch3 = chars[counter + 2]; + uchar ch4 = chars[counter + 3]; + if (!trusted) + if ((ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80 || (ch4 & 0xc0) != 0x80) { + dst[counter++] = QChar::ReplacementCharacter; + return; + } + + uint ucs = (ch & 0x1f) << 18 | (ch2 & 0x3f) << 12 + | (ch3 & 0x3f) << 6 | (ch4 & 0x3f); + + // dst[counter] will correspond to chars[counter..counter+2], so adjust + chars += 3; + len -= 3; + if (trusted || (ucs >= 0x10000 && ucs < 0x110000 && !isUnicodeNonCharacter(ucs))) { + dst[counter + 0] = QChar::highSurrogate(ucs); + dst[counter + 1] = QChar::lowSurrogate(ucs); + counter += 2; + } else { + dst[counter++] = QChar::ReplacementCharacter; + } + return; + } + + ++counter; +} + +int fromUtf8_optimised_for_ascii(ushort *qch, const char *chars, int len) +{ + if (len > 3 + && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { + // starts with a byte order mark + chars += 3; + len -= 3; + } + + qptrdiff counter = 0; + ushort *dst = qch; + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + + // UTF-8 character found + extract_utf8_multibyte<false>(dst, chars, counter, len); + } + return dst + counter - qch; +} + +#ifdef __SSE2__ +int fromUtf8_sse2_optimised_for_ascii(ushort *qch, const char *chars, int len) +{ + if (len > 3 + && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { + // starts with a byte order mark + chars += 3; + len -= 3; + } + + qptrdiff counter = 0; + ushort *dst = qch; + + len -= 16; + const __m128i nullMask = _mm_set1_epi32(0); + while (counter < len) { + const 
__m128i chunk = _mm_loadu_si128((__m128i*)(chars + counter)); // load + ushort highbytes = _mm_movemask_epi8(chunk); + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store + + if (!uchar(highbytes)) { + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store + + if (!highbytes) { + counter += 16; + continue; + } + } + + // UTF-8 character found + // which one? + counter += bsf_nonzero(highbytes); + len += 16; + extract_utf8_multibyte<false>(dst, chars, counter, len); + len -= 16; + } + len += 16; + + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + + // UTF-8 character found + extract_utf8_multibyte<false>(dst, chars, counter, len); + } + return dst + counter - qch; +} + +int fromUtf8_sse2_trusted_no_bom(ushort *qch, const char *chars, int len) +{ + qptrdiff counter = 0; + ushort *dst = qch; + + len -= 16; + const __m128i nullMask = _mm_set1_epi32(0); + while (counter < len) { + const __m128i chunk = _mm_loadu_si128((__m128i*)(chars + counter)); // load + ushort highbytes = _mm_movemask_epi8(chunk); + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store + + if (!uchar(highbytes)) { + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store + + if (!highbytes) { + counter += 16; + continue; + } + } + + // UTF-8 character found + // which one? 
+ counter += bsf_nonzero(highbytes); + len += 16; + extract_utf8_multibyte<true>(dst, chars, counter, len); + len -= 16; + } + len += 16; + + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + + // UTF-8 character found + extract_utf8_multibyte<true>(dst, chars, counter, len); + } + return dst + counter - qch; +} +#endif + +#ifdef __ARM_NEON__ +int fromUtf8_latin1_neon(ushort *dst, const char *chars, int len) +{ + fromLatin1_neon_improved(dst, chars, len); + return len; +} + +int fromUtf8_neon(ushort *qch, const char *chars, int len) +{ + if (len > 3 + && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { + // starts with a byte order mark + chars += 3; + len -= 3; + } + + ushort *dst = qch; + const uint8x8_t highBit = vdup_n_u8(0x80); + while (len >= 8) { + // load 8 bytes into one doubleword Neon register + const uint8x8_t chunk = vld1_u8((uint8_t *)chars); + const uint16x8_t expanded = vmovl_u8(chunk); + vst1q_u16(dst, expanded); + + uint8x8_t highBits = vtst_u8(chunk, highBit); + // we need to find the lowest byte set + int mask_low = vget_lane_u32(vreinterpret_u32_u8(highBits), 0); + int mask_high = vget_lane_u32(vreinterpret_u32_u8(highBits), 1); + + if (__builtin_expect(mask_low == 0 && mask_high == 0, 1)) { + chars += 8; + dst += 8; + len -= 8; + } else { + // UTF-8 character found + // which one? + qptrdiff pos; + asm ("rbit %0, %1\n" + "clz %1, %1\n" + : "=r" (pos) + : "r" (mask_low ? 
mask_low : mask_high)); + // now mask_low contains the number of leading zeroes + // or the value 32 (0x20) if no zeroes were found + // the number of leading zeroes is 8*pos + pos /= 8; + + extract_utf8_multibyte<false>(dst, chars, pos, len); + chars += pos; + dst += pos; + len -= pos; + } + } + + qptrdiff counter = 0; + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + // UTF-8 character found + extract_utf8_multibyte<false>(dst, chars, counter, len); + } + return dst + counter - qch; +} + +int fromUtf8_neon_trusted(ushort *qch, const char *chars, int len) +{ + ushort *dst = qch; + const uint8x8_t highBit = vdup_n_u8(0x80); + while (len >= 8) { + // load 8 bytes into one doubleword Neon register + const uint8x8_t chunk = vld1_u8((uint8_t *)chars); + const uint16x8_t expanded = vmovl_u8(chunk); + vst1q_u16(dst, expanded); + + uint8x8_t highBits = vtst_u8(chunk, highBit); + // we need to find the lowest byte set + int mask_low = vget_lane_u32(vreinterpret_u32_u8(highBits), 0); + int mask_high = vget_lane_u32(vreinterpret_u32_u8(highBits), 1); + + if (__builtin_expect(mask_low == 0 && mask_high == 0, 1)) { + chars += 8; + dst += 8; + len -= 8; + } else { + // UTF-8 character found + // which one? + qptrdiff pos; + asm ("rbit %0, %1\n" + "clz %1, %1\n" + : "=r" (pos) + : "r" (mask_low ? 
mask_low : mask_high)); + // now mask_low contains the number of leading zeroes + // or the value 32 (0x20) if no zeroes were found + // the number of leading zeroes is 8*pos + pos /= 8; + + extract_utf8_multibyte<true>(dst, chars, pos, len); + chars += pos; + dst += pos; + len -= pos; + } + } + + qptrdiff counter = 0; + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + + // UTF-8 character found + extract_utf8_multibyte<true>(dst, chars, counter, len); + } + return dst + counter - qch; +} +#endif + +void tst_QString::fromUtf8Alternatives_data() const +{ + QTest::addColumn<FromUtf8Function>("function"); + QTest::newRow("empty") << FromUtf8Function(0); + QTest::newRow("qt-4.7") << &fromUtf8_qt47; + QTest::newRow("qt-4.7-stateless") << &fromUtf8_qt47_stateless; + QTest::newRow("optimized-for-ascii") << &fromUtf8_optimised_for_ascii; +#ifdef __SSE2__ + QTest::newRow("sse2-optimized-for-ascii") << &fromUtf8_sse2_optimised_for_ascii; + QTest::newRow("sse2-trusted-no-bom") << &fromUtf8_sse2_trusted_no_bom; +#endif +#ifdef __ARM_NEON__ + QTest::newRow("neon") << &fromUtf8_neon; + QTest::newRow("neon-trusted-no-bom") << &fromUtf8_neon_trusted; +#endif + + QTest::newRow("latin1-generic") << &fromUtf8_latin1_regular; +#ifdef __SSE2__ + QTest::newRow("latin1-sse2-qt4.7") << &fromUtf8_latin1_qt47; + QTest::newRow("latin1-sse2-improved") << &fromUtf8_latin1_sse2_improved; +#endif +#ifdef __ARM_NEON__ + QTest::newRow("latin1-neon-improved") << &fromUtf8_latin1_neon; +#endif +} + +extern StringData fromUtf8Data; +static void fromUtf8Alternatives_internal(FromUtf8Function function, QString &dst, bool doVerify) +{ + if (!doVerify) { + // NOTE: this only works because the Latin1 data is ASCII-only + fromLatin1Alternatives_internal(reinterpret_cast<FromLatin1Function>(function), dst, doVerify); + } else { + if (strncmp(QTest::currentDataTag(), "latin1-", 7) == 0) + return; + } + + struct Entry + { + int 
len; + int offset1, offset2; + int align1, align2; + }; + const Entry *entries = reinterpret_cast<const Entry *>(fromUtf8Data.entries); + + for (int i = 0; i < fromUtf8Data.entryCount; ++i) { + int len = entries[i].len; + const char *src = fromUtf8Data.charData + entries[i].offset1; + + if (!function) + continue; + if (!doVerify) { + (function)(&dst.data()->unicode(), src, len); + } else { + dst.fill(QChar('x'), dst.length()); + + int utf8len = (function)(&dst.data()->unicode() + 8, src, len); + + QString expected = QString::fromUtf8(src, len); + QString final = dst.mid(8, expected.length()); + if (final != expected || utf8len != expected.length()) + qDebug() << i << entries[i].offset1 << utf8len << final << expected.length() << expected; + + QCOMPARE(final, expected); + QCOMPARE(utf8len, expected.length()); + + QString zeroes(8, QChar('x')); + QCOMPARE(dst.left(8), zeroes); + QCOMPARE(dst.mid(len + 8, 8), zeroes); + } + } +} + +void tst_QString::fromUtf8Alternatives() const +{ + QFETCH(FromUtf8Function, function); + + QString dst(fromUtf8Data.maxLength + 16, QChar('x')); + fromUtf8Alternatives_internal(function, dst, true); + + QBENCHMARK { + fromUtf8Alternatives_internal(function, dst, false); + } +} + +QTEST_MAIN(tst_QString) + +#include "main.moc" |