From 38be0d13830efd2d98281c645c3a60afe05ffece Mon Sep 17 00:00:00 2001
From: Qt by Nokia <qt-info@nokia.com>
Date: Wed, 27 Apr 2011 12:05:43 +0200
Subject: Initial import from the monolithic Qt.

This is the beginning of revision history for this module. If you
want to look at revision history older than this, please refer to the
Qt Git wiki for how to use Git history grafting. At the time of
writing, this wiki is located here:

http://qt.gitorious.org/qt/pages/GitIntroductionWithQt

If you have already performed the grafting and you don't see any
history beyond this commit, try running "git log" with the "--follow"
argument.

Branched from the monolithic repo, Qt master branch, at commit
896db169ea224deb96c59ce8af800d019de63f12
---
 tests/benchmarks/corelib/tools/qstring/main.cpp | 2601 +++++++++++++++++++++++
 1 file changed, 2601 insertions(+)
 create mode 100644 tests/benchmarks/corelib/tools/qstring/main.cpp

(limited to 'tests/benchmarks/corelib/tools/qstring/main.cpp')
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
new file mode 100644
index 0000000000..96f2c30cf4
--- /dev/null
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -0,0 +1,2601 @@
+/****************************************************************************
+**
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the test suite of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#include <QStringList>
+#include <QFile>
+#include <QtTest/QtTest>
+
+#ifdef Q_OS_SYMBIAN
+// In Symbian OS test data is located in applications private dir
+// Application private dir is default serach path for files, so SRCDIR can be set to empty
+#define SRCDIR ""
+#endif
+
+#ifdef Q_OS_UNIX
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+#include <private/qsimd_p.h>
+
+#include "data.h"
+
+// Benchmark fixture for QString comparison and conversion code.
+// Slots come in pairs: foo_data() registers the rows that foo() reads back
+// with QFETCH (see equals_data()/equals() and equals2_data()/equals2()).
+class tst_QString: public QObject
+{
+    Q_OBJECT
+public:
+    tst_QString();
+private slots:
+    // QString::operator== on strings of various lengths and alignments
+    void equals() const;
+    void equals_data() const;
+    // alternative equality implementations, selected by index into func[]
+    void equals2_data() const;
+    void equals2() const;
+    // alternative implementations of an ordering comparison
+    void ucstrncmp_data() const;
+    void ucstrncmp() const;
+    // conversion benchmarks (fromUtf8 / fromLatin1 and their alternatives)
+    void fromUtf8() const;
+    void fromLatin1_data() const;
+    void fromLatin1() const;
+    void fromLatin1Alternatives_data() const;
+    void fromLatin1Alternatives() const;
+    void fromUtf8Alternatives_data() const;
+    void fromUtf8Alternatives() const;
+};
+
+// Benchmarks QString::operator== on the string pair ("a", "b") of the
+// current data row.
+void tst_QString::equals() const
+{
+    QFETCH(QString, a);
+    QFETCH(QString, b);
+
+    QBENCHMARK {
+        // the result itself is irrelevant; only the comparison is timed
+        const bool same = (a == b);
+        Q_UNUSED(same);
+    }
+}
+
+// Default constructor; the fixture declares no data members to initialise.
+tst_QString::tst_QString()
+    : QObject()
+{
+}
+
+// Registers the rows for equals(): pairs of strings that view the same static
+// buffer at different offsets and lengths.
+void tst_QString::equals_data() const
+{
+    // 80 UTF-16 code units; everything the rows below look at (indices 0..63)
+    // holds the value 64.
+    static const struct {
+        ushort data[80];
+        int dummy;              // just to ensure 4-byte alignment
+    } storage = {
+        {
+            64, 64, 64, 64,  64, 64, 64, 64,
+            64, 64, 64, 64,  64, 64, 64, 64, // 16
+            64, 64, 64, 64,  64, 64, 64, 64,
+            64, 64, 64, 64,  64, 64, 64, 64, // 32
+            64, 64, 64, 64,  64, 64, 64, 64,
+            64, 64, 64, 64,  64, 64, 64, 64, // 48
+            64, 64, 64, 64,  64, 64, 64, 64,
+            64, 64, 64, 64,  64, 64, 64, 64, // 64
+            64, 64, 64, 64,  96, 96, 96, 96,
+            64, 64, 96, 96,  96, 96, 96, 96  // 80
+        }, 0
+    };
+    const QChar *chars = reinterpret_cast<const QChar *>(storage.data);
+
+    QTest::addColumn<QString>("a");
+    QTest::addColumn<QString>("b");
+
+    // These three rows are spelled out because "same-string" must pass the
+    // very same QString object twice.
+    QString whole = QString::fromRawData(chars, 64);
+    QTest::newRow("different-length") << whole << QString::fromRawData(chars, 4);
+    QTest::newRow("same-string") << whole << whole;
+    QTest::newRow("same-data") << whole << QString::fromRawData(chars, 64);
+
+    // Remaining rows: start offsets of "a" and "b" (in code units) and the
+    // common length. Lengths stay below 64 to avoid crossing a cache line
+    // (that is, at chars[64]).
+    static const struct {
+        const char *tag;
+        int offsetA;
+        int offsetB;
+        int length;
+    } rows[] = {
+        { "aligned-aligned-4n",       0, 2, 60 },
+        { "aligned-unaligned-4n",     0, 1, 60 },
+        { "unaligned-unaligned-4n",   1, 3, 60 },
+
+        { "aligned-aligned-4n+1",     0, 2, 61 },
+        { "aligned-unaligned-4n+1",   0, 1, 61 },
+        { "unaligned-unaligned-4n+1", 1, 3, 61 },
+
+        { "aligned-aligned-4n-1",     0, 2, 59 },
+        { "aligned-unaligned-4n-1",   0, 1, 59 },
+        { "unaligned-unaligned-4n-1", 1, 3, 59 },
+
+        { "aligned-aligned-2n",       0, 2, 58 },
+        { "aligned-unaligned-2n",     0, 1, 58 },
+        { "unaligned-unaligned-2n",   1, 3, 58 }
+    };
+    const int rowCount = int(sizeof(rows) / sizeof(rows[0]));
+    for (int r = 0; r < rowCount; ++r) {
+        QTest::newRow(rows[r].tag)
+                << QString::fromRawData(chars + rows[r].offsetA, rows[r].length)
+                << QString::fromRawData(chars + rows[r].offsetB, rows[r].length);
+    }
+}
+
+// Baseline implementation: hands the comparison of len UTF-16 code units
+// (two bytes each) to memcmp().
+static bool equals2_memcmp_call(const ushort *p1, const ushort *p2, int len)
+{
+    const int byteCount = len * 2;
+    const int order = memcmp(p1, p2, byteCount);
+    return order == 0;
+}
+
+// Compares two strings of len UTF-16 code units one byte at a time.
+// Returns true if they are equal (also for len == 0 or identical pointers).
+static bool equals2_bytewise(const ushort *p1, const ushort *p2, int len)
+{
+    if (!len || p1 == p2)
+        return true;
+
+    const uchar *lhs = reinterpret_cast<const uchar *>(p1);
+    const uchar *rhs = reinterpret_cast<const uchar *>(p2);
+    // every code unit contributes two bytes
+    for (int remaining = len * 2; remaining != 0; --remaining, ++lhs, ++rhs) {
+        if (*lhs != *rhs)
+            return false;
+    }
+    return true;
+}
+
+// Compares two strings of len UTF-16 code units one code unit at a time,
+// walking from the last code unit down to the first.
+// Returns true if they are equal (also for len == 0 or identical pointers).
+static bool equals2_shortwise(const ushort *p1, const ushort *p2, int len)
+{
+    if (!len || p1 == p2)
+        return true;
+
+    for (int idx = len; idx-- != 0; ) {
+        if (p1[idx] != p2[idx])
+            return false;
+    }
+    return true;
+}
+
+// Compares two strings of length UTF-16 code units, using 32-bit reads
+// whenever both pointers can be brought to a 4-byte boundary together.
+// Returns true if they are equal (also for length == 0 or identical pointers).
+static bool equals2_intwise(const ushort *p1, const ushort *p2, int length)
+{
+    if (!length || p1 == p2)
+        return true;
+
+    const quintptr address1 = quintptr(p1);
+    const quintptr address2 = quintptr(p2);
+
+    if ((address1 & 2) != (address2 & 2)) {
+        // one of the addresses isn't 4-byte aligned but the other is:
+        // no common alignment is possible, compare 16 bits at a time
+        for (const ushort *end = p1 + length; p1 != end; ++p1, ++p2) {
+            if (*p1 != *p2)
+                return false;
+        }
+        return true;
+    }
+
+    if (address1 & 2) {
+        // both addresses sit 2 bytes past a 4-byte boundary: compare the
+        // first code unit on its own to reach the boundary
+        if (*p1 != *p2)
+            return false;
+        --length;
+        ++p1;
+        ++p2;
+    }
+
+    // both addresses are 4-byte aligned now: compare two code units per step
+    const quint32 *d1 = reinterpret_cast<const quint32 *>(p1);
+    const quint32 *d2 = reinterpret_cast<const quint32 *>(p2);
+    for (const quint32 *end = d1 + (length >> 1); d1 != end; ++d1, ++d2) {
+        if (*d1 != *d2)
+            return false;
+    }
+
+    // an odd length leaves one trailing code unit
+    if (length & 1)
+        return *reinterpret_cast<const ushort *>(d1) == *reinterpret_cast<const ushort *>(d2);
+    return true;
+}
+
+// Compares the last few UTF-16 code units of two strings without a loop.
+//
+// p1, p2: first code unit still to be compared in each string
+// len:    number of code units left; the callers in this file pass at most 8,
+//         and no more than 8 code units are ever examined
+// Returns true if the first len code units of p1 and p2 are equal.
+static inline bool equals2_short_tail(const ushort *p1, const ushort *p2, int len)
+{
+    if (len) {
+        if (*p1 != *p2)
+            return false;
+        if (--len) {
+            if (p1[1] != p2[1])
+                return false;
+            if (--len) {
+                if (p1[2] != p2[2])
+                    return false;
+                if (--len) {
+                    if (p1[3] != p2[3])
+                        return false;
+                    if (--len) {
+                        if (p1[4] != p2[4])
+                            return false;
+                        if (--len) {
+                            if (p1[5] != p2[5])
+                                return false;
+                            if (--len) {
+                                if (p1[6] != p2[6])
+                                    return false;
+                                // The 8th code unit belongs to the range only
+                                // when len was 8: for len == 7 it must neither
+                                // be read nor influence the result.
+                                if (--len)
+                                    return p1[7] == p2[7];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return true;
+}
+
+//#pragma GCC optimize("no-unroll-loops")
+#ifdef __SSE2__
+// SSE2 equality comparison using aligned 16-byte loads on both strings, so
+// p1 and p2 must both be 16-byte aligned.
+// Returns true if the first len code units are equal.
+static bool equals2_sse2_aligned(const ushort *p1, const ushort *p2, int len)
+{
+    // As long as strictly more than 8 code units remain, compare a full vector.
+    for ( ; len > 8; len -= 8, p1 += 8, p2 += 8) {
+        const __m128i lhs = _mm_load_si128(reinterpret_cast<const __m128i *>(p1));
+        const __m128i rhs = _mm_load_si128(reinterpret_cast<const __m128i *>(p2));
+        // one mask bit per byte; all 16 set means all 8 code units matched
+        const int equalBytes = _mm_movemask_epi8(_mm_cmpeq_epi16(lhs, rhs));
+        if (ushort(equalBytes) != ushort(0xffff))
+            return false;
+    }
+
+    // whatever is left once the loop stops goes to the unrolled tail
+    return equals2_short_tail(p1, p2, len);
+}
+
+// SSE2 equality comparison using unaligned 16-byte loads on both strings.
+// Returns true if the first len code units are equal (also for len == 0 or
+// identical pointers).
+static bool equals2_sse2(const ushort *p1, const ushort *p2, int len)
+{
+    if (!len || p1 == p2)
+        return true;
+
+    // 8 code units (16 bytes) per iteration
+    for ( ; len >= 8; len -= 8, p1 += 8, p2 += 8) {
+        const __m128i lhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
+        const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
+        // one mask bit per byte; all 16 set means all 8 code units matched
+        const int equalBytes = _mm_movemask_epi8(_mm_cmpeq_epi16(lhs, rhs));
+        if (ushort(equalBytes) != 0xffff)
+            return false;
+    }
+
+    // fewer than 8 code units remain
+    return equals2_short_tail(p1, p2, len);
+}
+
+//static bool equals2_sse2(const ushort *p1, const ushort *p2, int len)
+//{
+//    register int val1 = quintptr(p1) & 0xf;
+//    register int val2 = quintptr(p2) & 0xf;
+//    if (false && val1 + val2 == 0)
+//        return equals2_sse2_aligned(p1, p2, len);
+//    else
+//        return equals2_sse2_unaligned(p1, p2, len);
+//}
+
+// SSE2 equality comparison whose main loop reads one string with aligned
+// 16-byte loads and the other with unaligned loads.
+//
+// p1, p2: the two strings, as UTF-16 code units
+// len:    number of code units to compare
+// Returns true if the first len code units are equal.
+//
+// When neither pointer is 16-byte aligned, the prologue steps both pointers
+// back by the smaller of the two misalignments, compares one whole vector
+// and discards the mask bits that belong to the bytes in front of the strings.
+static bool equals2_sse2_aligning(const ushort *p1, const ushort *p2, int len)
+{
+    if (len < 8)
+        return equals2_short_tail(p1, p2, len);
+
+    // number of code units already verified
+    qptrdiff counter = 0;
+
+    // which one is easier to align, p1 or p2 ?
+    // (val1/val2: byte offsets of p1/p2 past the previous 16-byte boundary)
+    register int val1 = quintptr(p1) & 0xf;
+    register int val2 = quintptr(p2) & 0xf;
+    if (val1 && val2) {
+#if 0
+        // disabled alternative: scalar steps up to the alignment boundary
+        // we'll align the one which requires the least number of steps
+        if (val1 > val2) {
+            qSwap(p1, p2);
+            val1 = val2;
+        }
+
+        // val1 contains the number of bytes past the 16-aligned mark
+        // we must read 16-val1 bytes to align
+        val1 = 16 - val1;
+        if (val1 & 0x2) {
+            if (*p1 != *p2)
+                return false;
+            --len;
+            ++counter;
+        }
+        while (val1 & 12) {
+            if (*(uint*)p1 != *(uint*)p2)
+                return false;
+            --len;
+            counter += 2;
+            val1 -= 4;
+        }
+#else
+        // we'll align the one closest to the 16-byte mark
+        // (after the swap p1 is the pointer with the smaller offset, val1)
+        if (val1 > val2) {
+            qSwap(p1, p2);
+            val1 = val2;
+        }
+
+        // we're reading val1 bytes too many
+        // p1 - val1/2 is 16-byte aligned, so it is dereferenced directly;
+        // p2 is stepped back by the same amount and loaded unaligned
+        __m128i q2 = _mm_loadu_si128((__m128i *)(p2 - val1/2));
+        __m128i cmp = _mm_cmpeq_epi16(*(__m128i *)(p1 - val1/2), q2);
+        // shift out the val1 mask bits of the bytes preceding the strings;
+        // the sign-extended remainder is all ones only if every real byte matched
+        if (short(_mm_movemask_epi8(cmp)) >> val1 != short(-1))
+            return false;
+
+        // the vector covered 8 - val1/2 code units of the actual strings
+        counter = 8 - val1/2;
+        len -= 8 - val1/2;
+#endif
+    } else if (!val2) {
+        // p2 is already aligned
+        qSwap(p1, p2);
+    }
+
+    // p1 is aligned
+    // (more precisely, p1 + counter is on a 16-byte boundary)
+
+    while (len >= 8) {
+        __m128i q1 = _mm_load_si128((__m128i *)(p1 + counter));
+        __m128i q2 = _mm_loadu_si128((__m128i *)(p2 + counter));
+        __m128i cmp = _mm_cmpeq_epi16(q1, q2);
+        // all 16 mask bits set means all 8 code units matched
+        if (ushort(_mm_movemask_epi8(cmp)) != ushort(0xffff))
+            return false;
+
+        len -= 8;
+        counter += 8;
+    }
+
+    // tail
+    return equals2_short_tail(p1 + counter, p2 + counter, len);
+}
+
+#ifdef __SSE3__
+// Same algorithm as equals2_sse2(), but the unaligned loads use the SSE3
+// lddqu instruction.
+// Returns true if the first len code units are equal (also for len == 0 or
+// identical pointers).
+static bool equals2_sse3(const ushort *p1, const ushort *p2, int len)
+{
+    if (!len || p1 == p2)
+        return true;
+
+    // 8 code units (16 bytes) per iteration
+    for ( ; len >= 8; len -= 8, p1 += 8, p2 += 8) {
+        const __m128i lhs = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(p1));
+        const __m128i rhs = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(p2));
+        // one mask bit per byte; all 16 set means all 8 code units matched
+        const int equalBytes = _mm_movemask_epi8(_mm_cmpeq_epi16(lhs, rhs));
+        if (ushort(equalBytes) != 0xffff)
+            return false;
+    }
+
+    // fewer than 8 code units remain
+    return equals2_short_tail(p1, p2, len);
+}
+
+#ifdef __SSSE3__
+// Compares the string that starts N bytes into the aligned block *m1 with the
+// (possibly unaligned) string at m2.
+//
+// The first string is read with aligned loads only: each 16-byte window is
+// assembled with palignr from two consecutive aligned blocks.
+// len is the number of code units to compare.
+template<int N> static inline bool equals2_ssse3_alignr(__m128i *m1, __m128i *m2, int len)
+{
+    __m128i previous = _mm_load_si128(m1);
+    for ( ; len >= 8; len -= 8, ++m1, ++m2) {
+        const __m128i next = _mm_load_si128(m1 + 1);
+        // bytes N..15 of the previous block followed by bytes 0..N-1 of the next
+        const __m128i window = _mm_alignr_epi8(next, previous, N);
+
+        const __m128i other = _mm_lddqu_si128(m2);
+        const int equalBytes = _mm_movemask_epi8(_mm_cmpeq_epi16(window, other));
+        if (ushort(equalBytes) != 0xffff)
+            return false;
+
+        // the block just loaded becomes the low half of the next window
+        previous = next;
+    }
+
+    // tail: the remaining code units of the first string start N bytes into *m1
+    if (len == 0)
+        return true;
+    return equals2_short_tail((const ushort *)m1 + N / 2, (const ushort*)m2, len);
+}
+
+// Compares the 16-byte-aligned string at m1 with the (possibly unaligned)
+// string at m2; len is the number of code units to compare.
+static inline bool equals2_ssse3_aligned(__m128i *m1, __m128i *m2, int len)
+{
+    for ( ; len >= 8; len -= 8, ++m1, ++m2) {
+        const __m128i other = _mm_lddqu_si128(m2);
+        // *m1 is read directly, which requires m1 to be aligned
+        const int equalBytes = _mm_movemask_epi8(_mm_cmpeq_epi16(*m1, other));
+        if (ushort(equalBytes) != 0xffff)
+            return false;
+    }
+
+    if (len == 0)
+        return true;
+    return equals2_short_tail((const ushort *)m1, (const ushort *)m2, len);
+}
+
+// SSSE3 equality comparison: p1 is always read through aligned loads, with
+// palignr compensating for its offset inside the 16-byte block; p2 is read
+// with unaligned loads.
+// Returns true if the first len code units are equal.
+static bool equals2_ssse3(const ushort *p1, const ushort *p2, int len)
+{
+    // short strings go straight to the unrolled tail
+    if (len < 8)
+        return equals2_short_tail(p1, p2, len);
+
+    // find the last aligned position at or below the p1 memory
+    __m128i *m1 = (__m128i *)(quintptr(p1) & ~0xf);
+    __m128i *m2 = (__m128i *)p2;
+
+    // Number of bytes between that position and p1. For a 2-byte aligned p1
+    // it can be 0, 2, 4, 6, 8, 10, 12 or 14; palignr needs the amount as a
+    // compile-time constant, hence one instantiation per value.
+    const qptrdiff diff = quintptr(p1) - quintptr(m1);
+
+    if (diff == 10)
+        return equals2_ssse3_alignr<10>(m1, m2, len);
+    if (diff == 2)
+        return equals2_ssse3_alignr<2>(m1, m2, len);
+    if (diff < 4)       // diff == 0: p1 is already aligned
+        return equals2_ssse3_aligned(m1, m2, len);
+    if (diff == 4)
+        return equals2_ssse3_alignr<4>(m1, m2, len);
+    if (diff < 8)       // diff == 6
+        return equals2_ssse3_alignr<6>(m1, m2, len);
+    if (diff < 12)      // diff == 8
+        return equals2_ssse3_alignr<8>(m1, m2, len);
+    if (diff == 12)
+        return equals2_ssse3_alignr<12>(m1, m2, len);
+    return equals2_ssse3_alignr<14>(m1, m2, len);   // diff == 14
+}
+
+// Compares the string that starts N bytes into the aligned block *m1 with the
+// 16-byte-aligned string at m2.
+//
+// Both strings are read with aligned accesses only: the first one is
+// assembled with palignr from two consecutive aligned blocks, the second one
+// is dereferenced directly. len is the number of code units to compare.
+template<int N> static inline bool equals2_ssse3_aligning_alignr(__m128i *m1, __m128i *m2, int len)
+{
+    __m128i previous = _mm_load_si128(m1);
+    for ( ; len >= 8; len -= 8, ++m1, ++m2) {
+        const __m128i next = _mm_load_si128(m1 + 1);
+        // bytes N..15 of the previous block followed by bytes 0..N-1 of the next
+        const __m128i window = _mm_alignr_epi8(next, previous, N);
+
+        const int equalBytes = _mm_movemask_epi8(_mm_cmpeq_epi16(window, *m2));
+        if (ushort(equalBytes) != 0xffff)
+            return false;
+
+        // the block just loaded becomes the low half of the next window
+        previous = next;
+    }
+
+    // tail: the remaining code units of the first string start N bytes into *m1
+    if (len == 0)
+        return true;
+    return equals2_short_tail((const ushort *)m1 + N / 2, (const ushort*)m2, len);
+}
+
+// SSSE3 equality comparison that first brings one string (p2) to a 16-byte
+// boundary and then reads the other one (p1) through aligned loads combined
+// with palignr.
+//
+// p1, p2: the two strings, as UTF-16 code units
+// len:    number of code units to compare
+// Returns true if the first len code units are equal.
+static bool equals2_ssse3_aligning(const ushort *p1, const ushort *p2, int len)
+{
+    if (len < 8)
+        return equals2_short_tail(p1, p2, len);
+
+    // number of code units already verified by the prologue
+    qptrdiff counter = 0;
+
+    // which one is easier to align, p1 or p2 ?
+    {
+        // byte offsets of p1/p2 past the previous 16-byte boundary
+        int val1 = quintptr(p1) & 0xf;
+        int val2 = quintptr(p2) & 0xf;
+        if (val1 && val2) {
+            // we'll align the one closest to the 16-byte mark:
+            // after this, p2 is the pointer with the smaller offset, val2
+            if (val1 < val2) {
+                qSwap(p1, p2);
+                val2 = val1;
+            }
+
+            // we're reading val2 bytes too many:
+            // p2 - val2/2 is 16-byte aligned and dereferenced directly, p1 is
+            // stepped back by the same amount and loaded unaligned
+            __m128i q1 = _mm_lddqu_si128((__m128i *)(p1 - val2/2));
+            __m128i cmp = _mm_cmpeq_epi16(q1, *(__m128i *)(p2 - val2/2));
+            // Drop exactly the val2 mask bits that belong to the bytes in
+            // front of the strings. Shifting by val1 instead would, whenever
+            // val1 > val2, also drop bits of real string bytes that are never
+            // checked again (counter skips past them below).
+            if (short(_mm_movemask_epi8(cmp)) >> val2 != short(-1))
+                return false;
+
+            // the vector covered 8 - val2/2 code units of the actual strings
+            counter = 8 - val2/2;
+            len -= 8 - val2/2;
+        } else if (!val1) {
+            // p1 is already aligned
+            qSwap(p1, p2);
+        }
+    }
+
+    // p2 is aligned now (that is, p2 + counter is on a 16-byte boundary)
+    // we want to use palignr in the mis-alignment of p1
+    __m128i *m1 = (__m128i *)(quintptr(p1 + counter) & ~0xf);
+    __m128i *m2 = (__m128i *)(p2 + counter);
+    int val1 = quintptr(p1 + counter) - quintptr(m1);
+
+    // val1 contains the number of extra bytes between m1 and p1 + counter;
+    // palignr needs it as a compile-time constant, hence the dispatch
+    if (val1 == 8)
+        return equals2_ssse3_aligning_alignr<8>(m1, m2, len);
+    if (val1 == 0)
+        return equals2_sse2_aligned(p1 + counter, p2 + counter, len);
+    if (val1 < 8) {
+        if (val1 < 4) {
+            return equals2_ssse3_aligning_alignr<2>(m1, m2, len);
+        } else {
+            if (val1 == 4)
+                return equals2_ssse3_aligning_alignr<4>(m1, m2, len);
+            else // val1 == 6
+                return equals2_ssse3_aligning_alignr<6>(m1, m2, len);
+        }
+    } else {
+        if (val1 < 12) {
+            return equals2_ssse3_aligning_alignr<10>(m1, m2, len);
+        } else {
+            if (val1 == 12)
+                return equals2_ssse3_aligning_alignr<12>(m1, m2, len);
+            else // val1 == 14
+                return equals2_ssse3_aligning_alignr<14>(m1, m2, len);
+        }
+    }
+}
+
+#ifdef __SSE4_1__
+// Equality comparison built on the pcmpestri string instruction.
+//
+// p1, p2: the two strings, as UTF-16 code units
+// len:    number of code units to compare
+// Returns true if the first len code units are equal (also for len == 0 or
+// identical pointers).
+//
+// NOTE(review): pcmpestri and the _SIDD_* constants belong to SSE4.2, while
+// this function is compiled under __SSE4_1__ -- confirm the intended guard.
+static bool equals2_sse4(const ushort *p1, const ushort *p2, int len)
+{
+    // We use the pcmpestri instruction searching for differences (negative polarity)
+    // it will reset CF if it's all equal
+    // it will reset OF if the first char is equal
+    // it will set ZF & SF if the length is less than 8 (which means we've done the last operation)
+    // the three possible conditions are:
+    //  difference found:         CF = 1
+    //  all equal, not finished:  CF = ZF = SF = 0
+    //  all equal, finished:      CF = 0, ZF = SF = 1
+    // We use the JA instruction that jumps if ZF = 0 and CF = 0
+    if (p1 == p2 || !len)
+        return true;
+
+    // This function may read some bytes past the end of p1 or p2
+    // It is safe to do that, as long as those extra bytes (beyond p1+len and p2+len)
+    // are on the same page as the last valid byte.
+    // If len is a multiple of 8, we'll never load invalid bytes.
+    if (len & 7) {
+        // The last load would load (len & 7) valid code units and
+        // (8 - (len & 7)) invalid ones.
+        // So we can't do the last load if any of those is in a different
+        // page. That is, if:
+        //    pX + len      is on a different page from     pX + (len & ~7) + 8
+        //
+        // that is, if the last load starts less than 16 bytes from the page end:
+        //    pX + (len & ~7)  is where the last (partial) load would start
+        if (len < 8)
+            return equals2_short_tail(p1, p2, len);
+        if ((quintptr(p1 + (len & ~7)) & 0xfff) > 0xff0 ||
+                (quintptr(p2 + (len & ~7)) & 0xfff) > 0xff0) {
+
+            // yes, so we mustn't do the final 128-bit load:
+            // run the loop over the whole 8-unit groups only and finish the
+            // remainder with the scalar tail.
+            // On exit p1 points just past the compared groups and p2 holds
+            // the byte distance from p1 to the matching position in p2.
+            bool result;
+            asm (
+            "sub        %[p1], %[p2]\n\t"
+            "sub        $16, %[p1]\n\t"
+            "add        $8, %[len]\n\t"
+
+            // main loop:
+            "0:\n\t"
+            "add        $16, %[p1]\n\t"
+            "sub        $8, %[len]\n\t"
+            "jz         1f\n\t"
+            "lddqu      (%[p1]), %%xmm0\n\t"
+            "mov        %[len], %%edx\n\t"
+            "pcmpestri  %[mode], (%[p2],%[p1]), %%xmm0\n\t"
+
+            "jna        1f\n\t"
+            "add        $16, %[p1]\n\t"
+            "sub        $8, %[len]\n\t"
+            "jz         1f\n\t"
+            "lddqu      (%[p1]), %%xmm0\n\t"
+            "mov        %[len], %%edx\n\t"
+            "pcmpestri  %[mode], (%[p2],%[p1]), %%xmm0\n\t"
+
+            "ja         0b\n\t"
+            "1:\n\t"
+            "setnc      %[result]\n\t"
+            : [result] "=a" (result),
+              [p1] "+r" (p1),
+              [p2] "+r" (p2)
+            : [len] "0" (len & ~7),
+              [mode] "i" (_SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
+            : "%edx", "%ecx", "%xmm0"
+            );
+            return result && equals2_short_tail(p1, (const ushort *)(quintptr(p1) + quintptr(p2)), len & 7);
+        }
+    }
+
+    // Intrinsics version of the loop below, kept for reference:
+//    const qptrdiff disp = p2 - p1;
+//    p1 -= 8;
+//    len += 8;
+//    while (true) {
+//        enum { Mode = _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY };
+
+//        p1 += 8;
+//        len -= 8;
+//        if (!len)
+//            return true;
+
+//        __m128i q1 = _mm_lddqu_si128((__m128i *)(p1 + disp));
+//        __m128i *m2 = (__m128i *)p1;
+
+//        bool cmp_a = _mm_cmpestra(q1, len, *m2, len, Mode);
+//        if (cmp_a)
+//            continue;
+//        return !_mm_cmpestrc(q1, len, *m2, len, Mode);
+//    }
+//    return true;
+
+    // Whole-string loop; the last iteration may be a partial vector, which
+    // pcmpestri handles through the explicit lengths in eax/edx.
+    // The asm rewrites the registers holding p1 and p2, so both are declared
+    // as read-write operands (as in the statement above); declaring them as
+    // plain inputs would let the compiler assume they still hold the
+    // original pointers.
+    bool result;
+    asm (
+        "sub        %[p1], %[p2]\n\t"
+        "sub        $16, %[p1]\n\t"
+        "add        $8, %[len]\n\t"
+
+    "0:\n\t"
+        "add        $16, %[p1]\n\t"
+        "sub        $8, %[len]\n\t"
+        "jz         1f\n\t"
+        "lddqu      (%[p2],%[p1]), %%xmm0\n\t"
+        "mov        %[len], %%edx\n\t"
+        "pcmpestri  %[mode], (%[p1]), %%xmm0\n\t"
+
+        "jna        1f\n\t"
+        "add        $16, %[p1]\n\t"
+        "sub        $8, %[len]\n\t"
+        "jz         1f\n\t"
+        "lddqu      (%[p2],%[p1]), %%xmm0\n\t"
+        "mov        %[len], %%edx\n\t"
+        "pcmpestri  %[mode], (%[p1]), %%xmm0\n\t"
+
+        "ja         0b\n\t"
+
+    "1:\n\t"
+        "setnc      %[result]\n\t"
+        : [result] "=a" (result),
+          [p1] "+r" (p1),
+          [p2] "+r" (p2)
+        : [len] "0" (len),
+          [mode] "i" (_SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
+        : "%edx", "%ecx", "%xmm0"
+    );
+    return result;
+}
+
+#endif
+#endif
+#endif
+#endif
+
+// Signature shared by all equality implementations:
+// (string 1, string 2, length in UTF-16 code units) -> true if equal.
+typedef bool (* FuncPtr)(const ushort *, const ushort *, int);
+
+// Implementations indexed by the "algorithm" value of equals2_data(); the
+// order here must match the numbers registered there. The trailing 0 is a
+// terminator and is not counted in functionCount.
+static const FuncPtr func[] = {
+    equals2_memcmp_call, // 0
+    equals2_bytewise, // 1
+    equals2_shortwise, // 2
+    equals2_intwise, // 3
+#ifdef __SSE2__
+    equals2_sse2, // 4
+    equals2_sse2_aligning, // 5
+#ifdef __SSE3__
+    equals2_sse3, // 6
+#ifdef __SSSE3__
+    equals2_ssse3, // 7
+    equals2_ssse3_aligning, // 8 (the "ssse3_aligning" row)
+#ifdef __SSE4_1__
+    equals2_sse4, // 9
+#endif
+#endif
+#endif
+#endif
+    0
+};
+static const int functionCount = sizeof(func)/sizeof(func[0]) - 1;
+
+// Registers one row per equality implementation.
+// The "algorithm" value is used by equals2() as an index into func[];
+// -1 is special and makes equals2() run equals2_selftest() instead.
+// The preprocessor guards mirror the ones around the func[] entries.
+void tst_QString::equals2_data() const
+{
+    QTest::addColumn<int>("algorithm");
+    QTest::newRow("selftest") << -1;
+    QTest::newRow("memcmp_call") << 0;
+    QTest::newRow("bytewise") << 1;
+    QTest::newRow("shortwise") << 2;
+    QTest::newRow("intwise") << 3;
+#ifdef __SSE2__
+    QTest::newRow("sse2") << 4;
+    QTest::newRow("sse2_aligning") << 5;
+#ifdef __SSE3__
+    QTest::newRow("sse3") << 6;
+#ifdef __SSSE3__
+    QTest::newRow("ssse3") << 7;
+    QTest::newRow("ssse3_aligning") << 8;
+#ifdef __SSE4_1__
+    // NOTE(review): the row is called "sse4.2" but is guarded by __SSE4_1__ --
+    // confirm which instruction set is intended.
+    QTest::newRow("sse4.2") << 9;
+#endif
+#endif
+#endif
+#endif
+}
+
+// Sanity check for every entry of func[].
+//
+// On Unix it first runs each implementation on strings placed at the very
+// start and the very end of an isolated memory page, so that an
+// implementation reading outside the page is expected to crash. It then
+// compares the result of each implementation against memcmp() for every
+// entry of stringCollection and reports mismatches with qWarning().
+static void __attribute__((noinline)) equals2_selftest()
+{
+#ifdef Q_OS_UNIX
+    const long pagesize = sysconf(_SC_PAGESIZE);
+    void *page1, *page3;
+    ushort *page2;
+    // NOTE(review): the mmap() results are not checked against MAP_FAILED.
+    page1 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    page2 = (ushort *)mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+    page3 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+    // the three mappings are expected to be neighbours; unmapping the outer
+    // two then leaves page2 surrounded by unmapped memory
+    Q_ASSERT(quintptr(page2) == quintptr(page1) + pagesize || quintptr(page2) == quintptr(page1) - pagesize);
+    Q_ASSERT(quintptr(page3) == quintptr(page2) + pagesize || quintptr(page3) == quintptr(page2) - pagesize);
+    munmap(page1, pagesize);
+    munmap(page3, pagesize);
+
+    // populate our page: every code unit is 0x41, same as the needle below
+    for (uint i = 0; i < pagesize / sizeof(long long); ++i)
+        ((long long *)page2)[i] = Q_INT64_C(0x0041004100410041);
+
+    // the following should crash:
+    //page2[-1] = 0xdead;
+    //page2[pagesize / sizeof(ushort) + 1] = 0xbeef;
+
+    // 17 code units: two full 8-unit groups plus a one-unit tail
+    static const ushort needle[] = {
+        0x41, 0x41, 0x41, 0x41,   0x41, 0x41, 0x41, 0x41,
+        0x41, 0x41, 0x41, 0x41,   0x41, 0x41, 0x41, 0x41,
+        0x41
+    };
+
+    for (int algo = 0; algo < functionCount; ++algo) {
+        // boundary condition test:
+        // the results are ignored, only out-of-page accesses matter here
+        for (int i = 0; i < 8; ++i) {
+            // string starting i code units after the beginning of the page
+            (func[algo])(page2 + i, needle, sizeof needle / 2);
+            // string ending i + 1 code units before the end of the page
+            (func[algo])(page2 - i - 1 - sizeof(needle)/2 + pagesize/2, needle, sizeof needle/2);
+        }
+    }
+
+    munmap(page2, pagesize);
+#endif
+
+    for (int algo = 0; algo < functionCount; ++algo) {
+        for (int i = 0; i < stringCollectionCount; ++i) {
+            const ushort *p1 = stringCollectionData + stringCollection[i].offset1;
+            const ushort *p2 = stringCollectionData + stringCollection[i].offset2;
+            // len counts code units, memcmp() and QByteArray count bytes
+            const int byteCount = stringCollection[i].len * 2;
+            bool expected = memcmp(p1, p2, byteCount) == 0;
+
+            bool result = (func[algo])(p1, p2, stringCollection[i].len);
+            if (expected != result)
+                qWarning().nospace()
+                        << "algo=" << algo
+                        << " i=" << i
+                        << " failed (" << result << "!=" << expected
+                        << "); strings were "
+                        << QByteArray((char*)p1, byteCount).toHex()
+                        << " and "
+                        << QByteArray((char*)p2, byteCount).toHex();
+        }
+    }
+}
+
+// Benchmarks the implementation func[algorithm] over all of stringCollection;
+// the special value -1 runs the self-test instead of a benchmark.
+void tst_QString::equals2() const
+{
+    QFETCH(int, algorithm);
+
+    if (algorithm != -1) {
+        QBENCHMARK {
+            for (int row = 0; row < stringCollectionCount; ++row) {
+                const ushort *lhs = stringCollectionData + stringCollection[row].offset1;
+                const ushort *rhs = stringCollectionData + stringCollection[row].offset2;
+                // the outcome is irrelevant, only the call is being timed
+                const bool same = (func[algorithm])(lhs, rhs, stringCollection[row].len);
+                Q_UNUSED(same);
+            }
+        }
+        return;
+    }
+
+    equals2_selftest();
+}
+
+// Ordering comparison of two strings of l UTF-16 code units, one code unit
+// at a time. Returns 0 if they are equal, otherwise the difference between
+// the first pair of code units that differ.
+static int ucstrncmp_shortwise(const ushort *a, const ushort *b, int l)
+{
+    for ( ; l != 0; --l, ++a, ++b) {
+        if (*a != *b)
+            return *a - *b;
+    }
+    return 0;
+}
+
+// Ordering comparison of two strings of len UTF-16 code units, using 32-bit
+// reads whenever both pointers can be brought to a 4-byte boundary together.
+// Returns 0 if the strings are equal, otherwise the difference between the
+// first pair of code units that differ.
+static int ucstrncmp_intwise(const ushort *a, const ushort *b, int len)
+{
+    // Nothing to compare: the alignment step below must not read *a and *b,
+    // which lie outside an empty range.
+    if (len <= 0)
+        return 0;
+
+    // do both strings have the same alignment?
+    if ((quintptr(a) & 2) == (quintptr(b) & 2)) {
+        // are we aligned to 4 bytes?
+        // if not, compare one code unit on its own to get there
+        if (quintptr(a) & 2) {
+            if (*a != *b)
+                return *a - *b;
+            ++a;
+            ++b;
+            --len;
+        }
+
+        const uint *p1 = (const uint *)a;
+        const uint *p2 = (const uint *)b;
+        quintptr counter = 0;
+        // two code units per 32-bit read
+        for ( ; len > 1 ; len -= 2, ++counter) {
+            if (p1[counter] != p2[counter]) {
+                // which ushort isn't equal?
+                int diff = a[2*counter] - b[2*counter];
+                return diff ? diff : a[2*counter + 1] - b[2*counter + 1];
+            }
+        }
+
+        // an odd length leaves one trailing code unit
+        return len ? a[2*counter] - b[2*counter] : 0;
+    } else {
+        // no common alignment is possible: compare 16 bits at a time
+        while (len-- && *a == *b)
+            a++,b++;
+        if (len==-1)
+            return 0;
+        return *a - *b;
+    }
+}
+
+#ifdef __SSE2__
+// Ordering comparison of the last few UTF-16 code units of two strings,
+// without a loop.
+//
+// p1, p2: first code unit still to be compared in each string
+// len:    number of code units left; no more than 8 are ever examined
+// Returns 0 if the first len code units are equal, otherwise the difference
+// between the first pair of code units that differ.
+static inline int ucstrncmp_short_tail(const ushort *p1, const ushort *p2, int len)
+{
+    if (len) {
+        if (*p1 != *p2)
+            return *p1 - *p2;
+        if (--len) {
+            if (p1[1] != p2[1])
+                return p1[1] - p2[1];
+            if (--len) {
+                if (p1[2] != p2[2])
+                    return p1[2] - p2[2];
+                if (--len) {
+                    if (p1[3] != p2[3])
+                        return p1[3] - p2[3];
+                    if (--len) {
+                        if (p1[4] != p2[4])
+                            return p1[4] - p2[4];
+                        if (--len) {
+                            if (p1[5] != p2[5])
+                                return p1[5] - p2[5];
+                            if (--len) {
+                                if (p1[6] != p2[6])
+                                    return p1[6] - p2[6];
+                                // The 8th code unit belongs to the range only
+                                // when len was 8: for len == 7 it must neither
+                                // be read nor influence the result.
+                                if (--len)
+                                    return p1[7] - p2[7];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+// Returns the zero-based index of the lowest set bit of val.
+// val must be non-zero; the result is undefined otherwise.
+//
+// The compiler builtins are used instead of hand-written "bsf" assembly: the
+// asm paired a long input with an int output without an operand-size suffix,
+// and compilers other than GCC-compatible ones and MSVC had no return value.
+static inline int bsf_nonzero(long val)
+{
+# ifdef Q_CC_GNU
+    return __builtin_ctzl(val);
+# elif defined(Q_CC_MSVC)
+    // _BitScanForward stores the index through an unsigned long pointer
+    unsigned long result;
+    _BitScanForward(&result, val);
+    return int(result);
+# else
+    // portable fallback: shift right until the lowest bit is set
+    int result = 0;
+    for (unsigned long bits = val; !(bits & 1); bits >>= 1)
+        ++result;
+    return result;
+# endif
+}
+
+// SSE2 ordering comparison using unaligned 16-byte loads on both strings.
+// Returns 0 if the first len code units are equal, otherwise the difference
+// between the first pair of code units that differ.
+static int ucstrncmp_sse2(const ushort *a, const ushort *b, int len)
+{
+    // 8 code units (16 bytes) per iteration
+    for ( ; len >= 8; len -= 8, a += 8, b += 8) {
+        const __m128i lhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a));
+        const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b));
+        const __m128i equal = _mm_cmpeq_epi16(lhs, rhs);
+        // inverted byte mask: a set bit marks a byte that differs
+        const ushort differing = ~uint(_mm_movemask_epi8(equal));
+        if (differing) {
+            // which ushort isn't equal? (two mask bits per code unit)
+            const int index = bsf_nonzero(differing)/2;
+            return a[index] - b[index];
+        }
+    }
+    // fewer than 8 code units remain
+    return ucstrncmp_short_tail(a, b, len);
+}
+
+// SSE2 ordering comparison that compares the first 8 code units with
+// unaligned loads, then advances to the next 16-byte boundary of a so that
+// the main loop can use aligned loads for a (b stays unaligned).
+// Returns 0 if the first len code units are equal, otherwise the difference
+// between the first pair of code units that differ.
+static int ucstrncmp_sse2_aligning(const ushort *a, const ushort *b, int len)
+{
+    // too short for even one vector: unrolled tail only
+    if (len < 8)
+        return ucstrncmp_short_tail(a, b, len);
+
+    {
+        const __m128i lhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a));
+        const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b));
+        const __m128i equal = _mm_cmpeq_epi16(lhs, rhs);
+        // inverted byte mask: a set bit marks a byte that differs
+        const ushort differing = ~uint(_mm_movemask_epi8(equal));
+        if (differing) {
+            // which ushort isn't equal?
+            const int index = bsf_nonzero(differing)/2;
+            return a[index] - b[index];
+        }
+    }
+
+    // now align to do 16-byte loads: skip to the next 16-byte boundary of a
+    // (everything skipped was covered by the comparison above)
+    const int skipped = 8 - (quintptr(a) & 0xf)/2;
+    len -= skipped;
+    a += skipped;
+    b += skipped;
+
+    for ( ; len >= 8; len -= 8, a += 8, b += 8) {
+        const __m128i lhs = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
+        const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b));
+        const __m128i equal = _mm_cmpeq_epi16(lhs, rhs);
+        const ushort differing = ~uint(_mm_movemask_epi8(equal));
+        if (differing) {
+            // which ushort isn't equal?
+            const int index = bsf_nonzero(differing)/2;
+            return a[index] - b[index];
+        }
+    }
+    // fewer than 8 code units remain
+    return ucstrncmp_short_tail(a, b, len);
+}
+
+// Same loop as ucstrncmp_sse2(), but both \a a and \a b are read with
+// aligned loads, so the caller must pass 16-byte aligned pointers.
+static inline int ucstrncmp_sse2_aligned(const ushort *a, const ushort *b, int len)
+{
+    quintptr counter = 0;
+    while (len >= 8) {
+        __m128i m1 = _mm_load_si128((__m128i *)(a + counter));
+        __m128i m2 = _mm_load_si128((__m128i *)(b + counter));
+        __m128i cmp = _mm_cmpeq_epi16(m1, m2);
+        // set bits mark the bytes that differ
+        ushort mask = ~uint(_mm_movemask_epi8(cmp));
+        if (mask) {
+            // which ushort isn't equal?
+            counter += bsf_nonzero(mask)/2;
+            return a[counter] - b[counter];
+        }
+
+        counter += 8;
+        len -= 8;
+    }
+    // fewer than 8 ushorts left
+    return ucstrncmp_short_tail(a + counter, b + counter, len);
+}
+
+#ifdef __SSSE3__
+// Case of the SSSE3 comparison in which \a a needs no byte shifting: \a a is
+// read with an aligned load (so it must be 16-byte aligned) and \a b with
+// _mm_lddqu_si128, which has no alignment requirement.
+static inline int ucstrncmp_ssse3_alignr_aligned(const ushort *a, const ushort *b, int len)
+{
+    quintptr counter = 0;
+    while (len >= 8) {
+        __m128i m1 = _mm_load_si128((__m128i *)(a + counter));
+        __m128i m2 = _mm_lddqu_si128((__m128i *)(b + counter));
+        __m128i cmp = _mm_cmpeq_epi16(m1, m2);
+        // set bits mark the bytes that differ
+        ushort mask = ~uint(_mm_movemask_epi8(cmp));
+        if (mask) {
+            // which ushort isn't equal?
+            counter += bsf_nonzero(mask)/2;
+            return a[counter] - b[counter];
+        }
+
+        counter += 8;
+        len -= 8;
+    }
+    return ucstrncmp_short_tail(a + counter, b + counter, len);
+}
+
+
+// Signature of the 128-bit load intrinsic used to read \a b below
+typedef __m128i (* MMLoadFunction)(const __m128i *);
+// Compares using only aligned loads on \a a.
+//
+// \a a must already be rounded down to a 16-byte boundary; the string data
+// itself starts N bytes (N/2 ushorts) after \a a. Each iteration loads the
+// next aligned 16-byte block of \a a and uses _mm_alignr_epi8 to extract the
+// 16 bytes that start N bytes into the previous block. \a b is read with
+// LoadFunction.
+//
+// The loop is a do-while, so the caller must guarantee len >= 8 on entry.
+template<int N, MMLoadFunction LoadFunction>
+static inline int ucstrncmp_ssse3_alignr(const ushort *a, const ushort *b, int len)
+{
+    qptrdiff counter = 0;
+    __m128i lower, upper;
+    upper = _mm_load_si128((__m128i *)a);
+
+    do {
+        // the block loaded last time becomes the low half of the pair
+        lower = upper;
+        upper = _mm_load_si128((__m128i *)(a + counter) + 1);
+        __m128i merged = _mm_alignr_epi8(upper, lower, N);
+
+        __m128i m2 = LoadFunction((__m128i *)(b + counter));
+        __m128i cmp = _mm_cmpeq_epi16(merged, m2);
+        // set bits mark the bytes that differ
+        ushort mask = ~uint(_mm_movemask_epi8(cmp));
+        if (mask) {
+            // which ushort isn't equal?
+            counter += bsf_nonzero(mask)/2;
+            // a is offset by N/2 ushorts relative to the real string start
+            return a[counter + N/2] - b[counter];
+        }
+
+        counter += 8;
+        len -= 8;
+    } while (len >= 8);
+
+    return ucstrncmp_short_tail(a + counter + N/2, b + counter, len);
+}
+
+// SSSE3 comparison that never aligns \a b: \a a is rounded down to a 16-byte
+// boundary and the matching ucstrncmp_ssse3_alignr<N> instantiation is chosen
+// from the byte misalignment of \a a. \a b is always read with
+// _mm_lddqu_si128. Strings shorter than 8 ushorts go straight to
+// ucstrncmp_short_tail().
+//
+// NOTE(review): the dispatch assumes \a a is 2-byte aligned, i.e. that val is
+// even; with that assumption "val < 4" below means val == 0 and "val < 12"
+// means val == 8.
+static int ucstrncmp_ssse3(const ushort *a, const ushort *b, int len)
+{
+    if (len >= 8) {
+        int val = quintptr(a) & 0xf;
+        // round a down to the previous 16-byte boundary
+        a -= val/2;
+
+        // 10 and 2 are tested first; presumably they are the most common
+        // misalignments (fromLatin1_prolog_unrolled notes that QString data
+        // pointers most often end in 0x2 or 0xa)
+        if (val == 10)
+            return ucstrncmp_ssse3_alignr<10, _mm_lddqu_si128>(a, b, len);
+        else if (val == 2)
+            return ucstrncmp_ssse3_alignr<2, _mm_lddqu_si128>(a, b, len);
+        if (val < 8) {
+            if (val < 4)
+                return ucstrncmp_ssse3_alignr_aligned(a, b, len);
+            else if (val == 4)
+                    return ucstrncmp_ssse3_alignr<4, _mm_lddqu_si128>(a, b, len);
+            else
+                    return ucstrncmp_ssse3_alignr<6, _mm_lddqu_si128>(a, b, len);
+        } else {
+            if (val < 12)
+                return ucstrncmp_ssse3_alignr<8, _mm_lddqu_si128>(a, b, len);
+            else if (val == 12)
+                return ucstrncmp_ssse3_alignr<12, _mm_lddqu_si128>(a, b, len);
+            else
+                return ucstrncmp_ssse3_alignr<14, _mm_lddqu_si128>(a, b, len);
+        }
+    }
+    return ucstrncmp_short_tail(a, b, len);
+}
+
+// SSSE3 comparison that first aligns \a b: one unaligned block of 8 ushorts
+// is compared, then both pointers advance until \a b is on a 16-byte
+// boundary. After that \a a is rounded down and the ucstrncmp_ssse3_alignr<N>
+// instantiation matching its remaining misalignment is used, with aligned
+// loads for \a b.
+//
+// NOTE(review): as in ucstrncmp_ssse3(), the dispatch assumes val is even, so
+// "val < 4" means val == 2 and "val < 12" means val == 10 here.
+static int ucstrncmp_ssse3_aligning(const ushort *a, const ushort *b, int len)
+{
+    if (len >= 8) {
+        __m128i m1 = _mm_loadu_si128((__m128i *)a);
+        __m128i m2 = _mm_loadu_si128((__m128i *)b);
+        __m128i cmp = _mm_cmpeq_epi16(m1, m2);
+        // set bits mark the bytes that differ
+        ushort mask = ~uint(_mm_movemask_epi8(cmp));
+        if (mask) {
+            // which ushort isn't equal?
+            int counter = bsf_nonzero(mask)/2;
+            return a[counter] - b[counter];
+        }
+
+
+        // now 'b' align to do 16-byte loads
+        // diff is between 1 and 8 ushorts, all inside the block just compared
+        int diff = 8 - (quintptr(b) & 0xf)/2;
+        len -= diff;
+        a += diff;
+        b += diff;
+    }
+
+    // ucstrncmp_ssse3_alignr() requires at least 8 ushorts
+    if (len < 8)
+        return ucstrncmp_short_tail(a, b, len);
+
+    // 'b' is aligned
+    int val = quintptr(a) & 0xf;
+    // round a down to the previous 16-byte boundary
+    a -= val/2;
+
+    if (val == 8)
+        return ucstrncmp_ssse3_alignr<8, _mm_load_si128>(a, b, len);
+    else if (val == 0)
+        return ucstrncmp_sse2_aligned(a, b, len);
+    if (val < 8) {
+        if (val < 4)
+            return ucstrncmp_ssse3_alignr<2, _mm_load_si128>(a, b, len);
+        else if (val == 4)
+            return ucstrncmp_ssse3_alignr<4, _mm_load_si128>(a, b, len);
+        else
+            return ucstrncmp_ssse3_alignr<6, _mm_load_si128>(a, b, len);
+    } else {
+        if (val < 12)
+            return ucstrncmp_ssse3_alignr<10, _mm_load_si128>(a, b, len);
+        else if (val == 12)
+            return ucstrncmp_ssse3_alignr<12, _mm_load_si128>(a, b, len);
+        else
+            return ucstrncmp_ssse3_alignr<14, _mm_load_si128>(a, b, len);
+    }
+}
+
+// Helper of ucstrncmp_ssse3_aligning2() for the case in which \a a and \a b
+// have the same misalignment. Both pointers have been rounded down to a
+// 16-byte boundary; the first \a garbage bytes of each first load precede
+// the strings and are shifted out of the comparison mask. \a len still
+// counts from the real start of the strings.
+static inline
+int ucstrncmp_ssse3_aligning2_aligned(const ushort *a, const ushort *b, int len, int garbage)
+{
+    // len >= 8
+    __m128i m1 = _mm_load_si128((const __m128i *)a);
+    __m128i m2 = _mm_load_si128((const __m128i *)b);
+    __m128i cmp = _mm_cmpeq_epi16(m1, m2);
+    // here a set bit means "equal"; sign extension makes the bits shifted in
+    // from the top copies of bit 15 rather than zeros
+    int mask = short(_mm_movemask_epi8(cmp)); // force sign extension
+    mask >>= garbage;
+    if (~mask) {
+        // which ushort isn't equal?
+        // (counter is a byte offset from the rounded-down pointers)
+        uint counter = (garbage + bsf_nonzero(~mask));
+        return a[counter/2] - b[counter/2];
+    }
+
+    // the first 16-garbage bytes (8-garbage/2 ushorts) were equal
+    len -= 8 - garbage/2;
+    return ucstrncmp_sse2_aligned(a + 8, b + 8, len);
+}
+
+// Helper of ucstrncmp_ssse3_aligning2() for strings whose misalignments
+// differ by N bytes. Both pointers have been rounded down to a 16-byte
+// boundary. \a b's data starts \a garbage bytes into its first block, \a a's
+// data starts garbage + N bytes into its first block; _mm_alignr_epi8 shifts
+// \a a by N bytes so that both sides line up. The \a garbage leading bytes
+// are shifted out of the first comparison mask.
+template<int N> static inline
+int ucstrncmp_ssse3_aligning2_alignr(const ushort *a, const ushort *b, int len, int garbage)
+{
+    // len >= 8
+    __m128i lower, upper, merged;
+    lower = _mm_load_si128((const __m128i*)a);
+    upper = _mm_load_si128((const __m128i*)(a + 8));
+    merged = _mm_alignr_epi8(upper, lower, N);
+
+    __m128i m2 = _mm_load_si128((const __m128i*)b);
+    __m128i cmp = _mm_cmpeq_epi16(merged, m2);
+    // here a set bit means "equal"; see the sign extension note
+    int mask = short(_mm_movemask_epi8(cmp)); // force sign extension
+    mask >>= garbage;
+    if (~mask) {
+        // which ushort isn't equal?
+        // (counter is a byte offset from the rounded-down b pointer)
+        uint counter = (garbage + bsf_nonzero(~mask));
+        return a[counter/2 + N/2] - b[counter/2];
+    }
+
+    // the first 16-garbage bytes (8-garbage/2 ushorts) were equal
+    quintptr counter = 8;
+    len -= 8 - garbage/2;
+    while (len >= 8) {
+        // reuse the block loaded last time as the low half of the pair
+        lower = upper;
+        upper = _mm_load_si128((__m128i *)(a + counter) + 1);
+        merged = _mm_alignr_epi8(upper, lower, N);
+
+        m2 = _mm_load_si128((__m128i *)(b + counter));
+        cmp = _mm_cmpeq_epi16(merged, m2);
+        // note: this inner mask shadows the one above and has the opposite
+        // sense (set bit = differing byte)
+        ushort mask = ~uint(_mm_movemask_epi8(cmp));
+        if (mask) {
+            // which ushort isn't equal?
+            counter += bsf_nonzero(mask)/2;
+            return a[counter + N/2] - b[counter];
+        }
+
+        counter += 8;
+        len -= 8;
+    }
+
+    return ucstrncmp_short_tail(a + counter + N/2, b + counter, len);
+}
+
+// Returns \a result with its sign flipped when \a invert is true, and
+// \a result unchanged otherwise.
+static inline int conditional_invert(int result, bool invert)
+{
+    return invert ? -result : result;
+}
+
+// SSSE3 comparison that performs aligned loads only. Both pointers are
+// rounded down to a 16-byte boundary and the helper matching the difference
+// between their misalignments is called. When \a b is the more misaligned of
+// the two, the helper is called with the arguments swapped and its result is
+// negated. Strings shorter than 8 ushorts are handled by ucstrncmp_intwise().
+//
+// NOTE(review): the dispatch assumes both pointers are 2-byte aligned, so the
+// offset is even; "offset < 4" then means 2 and "offset < 12" means 10.
+static int ucstrncmp_ssse3_aligning2(const ushort *a, const ushort *b, int len)
+{
+    // Different strategy from above: instead of doing two unaligned loads
+    // when trying to align, we'll only do aligned loads and round down the
+    // addresses of a and b. This means the first load will contain garbage
+    // in the beginning of the string, which we'll shift out of the way
+    // (after _mm_movemask_epi8)
+
+    if (len < 8)
+        return ucstrncmp_intwise(a, b, len);
+
+    // both a and b are misaligned
+    // we'll call the alignr function with the alignment *difference* between the two
+    int offset = (quintptr(a) & 0xf) - (quintptr(b) & 0xf);
+    if (offset >= 0) {
+        // from this point on, b has the shortest alignment
+        // and align(a) = align(b) + offset
+        // round down the alignment so align(b) == align(a) == 0
+        int garbage = (quintptr(b) & 0xf);
+        a = (const ushort*)(quintptr(a) & ~0xf);
+        b = (const ushort*)(quintptr(b) & ~0xf);
+
+        // now the first load of b will load 'garbage' extra bytes
+        // and the first load of a will load 'garbage + offset' extra bytes
+        if (offset == 8)
+            return ucstrncmp_ssse3_aligning2_alignr<8>(a, b, len, garbage);
+        if (offset == 0)
+            return ucstrncmp_ssse3_aligning2_aligned(a, b, len, garbage);
+        if (offset < 8) {
+            if (offset < 4)
+                return ucstrncmp_ssse3_aligning2_alignr<2>(a, b, len, garbage);
+            else if (offset == 4)
+                return ucstrncmp_ssse3_aligning2_alignr<4>(a, b, len, garbage);
+            else
+                return ucstrncmp_ssse3_aligning2_alignr<6>(a, b, len, garbage);
+        } else {
+            if (offset < 12)
+                return ucstrncmp_ssse3_aligning2_alignr<10>(a, b, len, garbage);
+            else if (offset == 12)
+                return ucstrncmp_ssse3_aligning2_alignr<12>(a, b, len, garbage);
+            else
+                return ucstrncmp_ssse3_aligning2_alignr<14>(a, b, len, garbage);
+        }
+    } else {
+        // same as above but inverted
+        // (a has the smaller misalignment here, so it supplies 'garbage' and
+        // takes the role of the second argument; the result is negated to
+        // undo the swap)
+        int garbage = (quintptr(a) & 0xf);
+        a = (const ushort*)(quintptr(a) & ~0xf);
+        b = (const ushort*)(quintptr(b) & ~0xf);
+
+        offset = -offset;
+        if (offset == 8)
+            return -ucstrncmp_ssse3_aligning2_alignr<8>(b, a, len, garbage);
+        if (offset < 8) {
+            if (offset < 4)
+                return -ucstrncmp_ssse3_aligning2_alignr<2>(b, a, len, garbage);
+            else if (offset == 4)
+                return -ucstrncmp_ssse3_aligning2_alignr<4>(b, a, len, garbage);
+            else
+                return -ucstrncmp_ssse3_aligning2_alignr<6>(b, a, len, garbage);
+        } else {
+            if (offset < 12)
+                return -ucstrncmp_ssse3_aligning2_alignr<10>(b, a, len, garbage);
+            else if (offset == 12)
+                return -ucstrncmp_ssse3_aligning2_alignr<12>(b, a, len, garbage);
+            else
+                return -ucstrncmp_ssse3_aligning2_alignr<14>(b, a, len, garbage);
+        }
+    }
+}
+
+#endif
+#endif
+
+// Common signature of all ucstrncmp alternatives; registered as a metatype so
+// that it can be used as a QTest data column.
+typedef int (* UcstrncmpFunction)(const ushort *, const ushort *, int);
+Q_DECLARE_METATYPE(UcstrncmpFunction)
+
+// Registers one data row per ucstrncmp alternative available in this build.
+// The "selftest" row carries a null function pointer, which makes
+// tst_QString::ucstrncmp() run its verification pass instead of a benchmark.
+void tst_QString::ucstrncmp_data() const
+{
+    QTest::addColumn<UcstrncmpFunction>("function");
+    QTest::newRow("selftest") << UcstrncmpFunction(0);
+    QTest::newRow("shortwise") << &ucstrncmp_shortwise;
+    QTest::newRow("intwise") << &ucstrncmp_intwise;
+#ifdef __SSE2__
+    QTest::newRow("sse2") << &ucstrncmp_sse2;
+    QTest::newRow("sse2_aligning") << &ucstrncmp_sse2_aligning;
+#ifdef __SSSE3__
+    QTest::newRow("ssse3") << &ucstrncmp_ssse3;
+    QTest::newRow("ssse3_aligning") << &ucstrncmp_ssse3_aligning;
+    QTest::newRow("ssse3_aligning2") << &ucstrncmp_ssse3_aligning2;
+#endif
+#endif
+}
+
+// Benchmarks the ucstrncmp alternative selected by the current data row.
+//
+// For the "selftest" row (null function pointer) no benchmark is run.
+// Instead, every alternative is
+//  - called on strings that touch both ends of an isolated memory page
+//    (Unix only), so that reading outside the string crashes, and
+//  - checked against ucstrncmp_shortwise() on the whole string collection,
+//    comparing only the sign of the results. Mismatches are reported with
+//    qWarning().
+void tst_QString::ucstrncmp() const
+{
+    QFETCH(UcstrncmpFunction, function);
+    if (!function) {
+        static const UcstrncmpFunction func[] = {
+            &ucstrncmp_shortwise,
+            &ucstrncmp_intwise,
+#ifdef __SSE2__
+            &ucstrncmp_sse2,
+            &ucstrncmp_sse2_aligning,
+#ifdef __SSSE3__
+            &ucstrncmp_ssse3,
+            &ucstrncmp_ssse3_aligning,
+            &ucstrncmp_ssse3_aligning2
+#endif
+#endif
+        };
+        static const int functionCount = sizeof func / sizeof func[0];
+
+#ifdef Q_OS_UNIX
+        // map three pages and release the outer two, so that page2 is
+        // surrounded by unmapped memory
+        const long pagesize = sysconf(_SC_PAGESIZE);
+        void *page1, *page3;
+        ushort *page2;
+        page1 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        page2 = (ushort *)mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+        page3 = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+        // stop with a clear message instead of dereferencing MAP_FAILED below
+        if (page1 == MAP_FAILED || page2 == MAP_FAILED || page3 == MAP_FAILED)
+            qFatal("mmap failed");
+
+        // NOTE(review): mmap() does not promise adjacent mappings; these
+        // assertions only document the layout the test relies on
+        Q_ASSERT(quintptr(page2) == quintptr(page1) + pagesize || quintptr(page2) == quintptr(page1) - pagesize);
+        Q_ASSERT(quintptr(page3) == quintptr(page2) + pagesize || quintptr(page3) == quintptr(page2) - pagesize);
+        munmap(page1, pagesize);
+        munmap(page3, pagesize);
+
+        // populate our page
+        for (uint i = 0; i < pagesize / sizeof(long long); ++i)
+            ((long long *)page2)[i] = Q_INT64_C(0x0041004100410041);
+
+        // the following should crash:
+        //page2[-1] = 0xdead;
+        //page2[pagesize / sizeof(ushort) + 1] = 0xbeef;
+
+        static const ushort needle[] = {
+            0x41, 0x41, 0x41, 0x41,   0x41, 0x41, 0x41, 0x41,
+            0x41, 0x41, 0x41, 0x41,   0x41, 0x41, 0x41, 0x41,
+            0x41
+        };
+
+        for (int algo = 0; algo < functionCount; ++algo) {
+            // boundary condition test:
+            // strings starting i ushorts after the start of the page, and
+            // strings ending i + 1 ushorts before the end of the page
+            for (int i = 0; i < 8; ++i) {
+                (func[algo])(page2 + i, needle, sizeof needle / 2);
+                (func[algo])(page2 - i - 1 - sizeof(needle)/2 + pagesize/2, needle, sizeof needle/2);
+            }
+        }
+
+        munmap(page2, pagesize);
+#endif
+
+        for (int algo = 0; algo < functionCount; ++algo) {
+            for (int i = 0; i < stringCollectionCount; ++i) {
+                const ushort *p1 = stringCollectionData + stringCollection[i].offset1;
+                const ushort *p2 = stringCollectionData + stringCollection[i].offset2;
+                // only the sign of the result is part of the contract
+                int expected = ucstrncmp_shortwise(p1, p2, stringCollection[i].len);
+                expected = qBound(-1, expected, 1);
+
+                int result = (func[algo])(p1, p2, stringCollection[i].len);
+                result = qBound(-1, result, 1);
+                if (expected != result) {
+                    // len counts ushorts, QByteArray wants a byte count
+                    const int byteCount = int(sizeof(ushort)) * stringCollection[i].len;
+                    qWarning().nospace()
+                        << "algo=" << algo
+                        << " i=" << i
+                        << " failed (" << result << "!=" << expected
+                        << "); strings were "
+                        << QByteArray((char*)p1, byteCount).toHex()
+                        << " and "
+                        << QByteArray((char*)p2, byteCount).toHex();
+                }
+            }
+        }
+        return;
+    }
+
+    QBENCHMARK {
+        for (int i = 0; i < stringCollectionCount; ++i) {
+            const ushort *p1 = stringCollectionData + stringCollection[i].offset1;
+            const ushort *p2 = stringCollectionData + stringCollection[i].offset2;
+            (function)(p1, p2, stringCollection[i].len);
+        }
+    }
+}
+
+// Benchmarks QString::fromUtf8() on the contents of utf-8.txt.
+void tst_QString::fromUtf8() const
+{
+    QFile input(SRCDIR "utf-8.txt");
+    const bool opened = input.open(QFile::ReadOnly);
+    if (!opened) {
+        qFatal("Cannot open input file");
+        return;
+    }
+
+    // read everything up front so that only the conversion is timed
+    QByteArray utf8 = input.readAll();
+    const int byteCount = utf8.size();
+    const char *bytes = utf8.constData();
+
+    QBENCHMARK {
+        QString::fromUtf8(bytes, byteCount);
+    }
+}
+
+// Data rows for tst_QString::fromLatin1(): four 10-byte Latin-1 inputs that
+// differ in the kind of bytes they contain.
+void tst_QString::fromLatin1_data() const
+{
+    QTest::addColumn<QByteArray>("latin1");
+
+    // make all the strings have the same length
+    QTest::newRow("ascii-only") << QByteArray("HelloWorld");
+    QTest::newRow("ascii+control") << QByteArray("Hello\1\r\n\x7f\t");
+    // explicit length: the literal contains embedded NULs
+    QTest::newRow("ascii+nul") << QByteArray("a\0zbc\0defg", 10);
+    QTest::newRow("non-ascii") << QByteArray("\x80\xc0\xff\x81\xc1\xfe\x90\xd0\xef\xa0");
+}
+
+// Benchmarks QString::fromLatin1(const QByteArray &) on the current data row,
+// using the input itself, a copy that is one byte shorter and a copy that is
+// twice as long.
+void tst_QString::fromLatin1() const
+{
+    QFETCH(QByteArray, latin1);
+
+    // grow the input by doubling until it is at least 128 bytes long
+    while (latin1.length() < 128) {
+        latin1 += latin1;
+    }
+
+    QByteArray copy1 = latin1, copy2 = latin1, copy3 = latin1;
+    copy1.chop(1);
+    copy2.detach();
+    copy3 += latin1; // longer length
+    // copy2 is not used by the benchmark loop below
+    copy2.clear();
+
+    QBENCHMARK {
+        QString s1 = QString::fromLatin1(latin1);
+        QString s2 = QString::fromLatin1(latin1);
+        QString s3 = QString::fromLatin1(copy1);
+        QString s4 = QString::fromLatin1(copy3);
+        s3 = QString::fromLatin1(copy3);
+    }
+}
+
+// Common signature of the Latin-1 to UTF-16 conversion alternatives
+// (destination, source, number of characters); registered as a metatype so
+// that it can be used as a QTest data column.
+typedef void (* FromLatin1Function)(ushort *, const char *, int);
+Q_DECLARE_METATYPE(FromLatin1Function)
+
+// Reference conversion: widens \a size Latin-1 bytes from \a str into ushorts
+// at \a dst, one character at a time. Bytes are treated as unsigned, so
+// values above 0x7f are not sign-extended.
+void fromLatin1_regular(ushort *dst, const char *str, int size)
+{
+    // from qstring.cpp:
+    while (size--)
+        *dst++ = (uchar)*str++;
+}
+
+#ifdef __SSE2__
+// SSE2 conversion that advances the pointers after each step: converts 16
+// characters per iteration with unaligned loads and stores, then finishes
+// the remaining (fewer than 16) characters one by one.
+void fromLatin1_sse2_qt47(ushort *dst, const char *str, int size)
+{
+    if (size >= 16) {
+        int chunkCount = size >> 4; // divided by 16
+        const __m128i nullMask = _mm_set1_epi32(0);
+        for (int i = 0; i < chunkCount; ++i) {
+            const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+            str += 16;
+
+            // unpack the first 8 bytes, padding with zeros
+            const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+            _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+            dst += 8;
+
+            // unpack the last 8 bytes, padding with zeros
+            const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+            _mm_storeu_si128((__m128i*)dst, secondHalf); // store
+            dst += 8;
+        }
+        size = size % 16;
+    }
+    // scalar tail
+    while (size--)
+        *dst++ = (uchar)*str++;
+}
+
+// Fully unrolled scalar tail used by the SSE2/SSE4.1 conversions: widens the
+// first \a size characters of \a str into \a dst. At most 16 characters are
+// converted, whatever the value of \a size.
+static inline void fromLatin1_epilog(ushort *dst, const char *str, int size)
+{
+    if (!size) return;
+    dst[0] = (uchar)str[0];
+    if (!--size) return;
+    dst[1] = (uchar)str[1];
+    if (!--size) return;
+    dst[2] = (uchar)str[2];
+    if (!--size) return;
+    dst[3] = (uchar)str[3];
+    if (!--size) return;
+    dst[4] = (uchar)str[4];
+    if (!--size) return;
+    dst[5] = (uchar)str[5];
+    if (!--size) return;
+    dst[6] = (uchar)str[6];
+    if (!--size) return;
+    dst[7] = (uchar)str[7];
+    if (!--size) return;
+    dst[8] = (uchar)str[8];
+    if (!--size) return;
+    dst[9] = (uchar)str[9];
+    if (!--size) return;
+    dst[10] = (uchar)str[10];
+    if (!--size) return;
+    dst[11] = (uchar)str[11];
+    if (!--size) return;
+    dst[12] = (uchar)str[12];
+    if (!--size) return;
+    dst[13] = (uchar)str[13];
+    if (!--size) return;
+    dst[14] = (uchar)str[14];
+    if (!--size) return;
+    dst[15] = (uchar)str[15];
+}
+
+// SSE2 conversion that keeps \a dst and \a str fixed and indexes both with a
+// single counter. Converts 16 characters per iteration with unaligned loads
+// and stores; the remaining (fewer than 16) characters are handled by
+// fromLatin1_epilog().
+void fromLatin1_sse2_improved(ushort *dst, const char *str, int size)
+{
+    const __m128i nullMask = _mm_set1_epi32(0);
+    qptrdiff counter = 0;
+    // bias size so that "size >= counter" means "16 or more characters left"
+    size -= 16;
+    while (size >= counter) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
+
+        // unpack the last 8 bytes, padding with zeros
+        const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
+
+        counter += 16;
+    }
+    // undo the bias before computing the tail length
+    size += 16;
+    fromLatin1_epilog(dst + counter, str + counter, size - counter);
+}
+
+// Like fromLatin1_sse2_improved(), but unrolled to 32 characters per loop
+// iteration, followed by at most one 16-character step and then
+// fromLatin1_epilog() for the remaining (fewer than 16) characters.
+void fromLatin1_sse2_improved2(ushort *dst, const char *str, int size)
+{
+    const __m128i nullMask = _mm_set1_epi32(0);
+    qptrdiff counter = 0;
+    // bias size so that "size >= counter" means "32 or more characters left"
+    size -= 32;
+    while (size >= counter) {
+        const __m128i chunk1 = _mm_loadu_si128((__m128i*)(str + counter)); // load
+        const __m128i chunk2 = _mm_loadu_si128((__m128i*)(str + counter + 16)); // load
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf1 = _mm_unpacklo_epi8(chunk1, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf1); // store
+
+        // unpack the last 8 bytes, padding with zeros
+        const __m128i secondHalf1 = _mm_unpackhi_epi8(chunk1, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf1); // store
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf2 = _mm_unpacklo_epi8(chunk2, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter + 16), firstHalf2); // store
+
+        // unpack the last 8 bytes, padding with zeros
+        const __m128i secondHalf2 = _mm_unpackhi_epi8(chunk2, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter + 24), secondHalf2); // store
+
+        counter += 32;
+    }
+    // the bias is now 16: "size >= counter" means "16 or more characters left"
+    size += 16;
+    if (size >= counter) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
+
+        // unpack the last 8 bytes, padding with zeros
+        const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
+
+        counter += 16;
+    }
+    // size is back to its original value
+    size += 16;
+    fromLatin1_epilog(dst + counter, str + counter, size - counter);
+}
+
+// Prolog for fromLatin1_sse2_withprolog(): widens \a size characters, for
+// size between 1 and 7, by jumping into a fall-through chain that copies from
+// the highest index down to index 0. Any other value of \a size (including 0
+// and 8) copies nothing.
+void fromLatin1_prolog_unrolled(ushort *dst, const char *str, int size)
+{
+    // QString's data pointer is most often ending in 0x2 or 0xa
+    // that means the two most common values for size are (8-1)=7 and (8-5)=3
+    if (size == 7)
+        goto copy_7;
+    if (size == 3)
+        goto copy_3;
+
+    // the less common sizes follow
+    if (size == 6)
+        goto copy_6;
+    if (size == 5)
+        goto copy_5;
+    if (size == 4)
+        goto copy_4;
+    if (size == 2)
+        goto copy_2;
+    if (size == 1)
+        goto copy_1;
+    return;
+
+copy_7:
+    dst[6] = (uchar)str[6];
+copy_6:
+    dst[5] = (uchar)str[5];
+copy_5:
+    dst[4] = (uchar)str[4];
+copy_4:
+    dst[3] = (uchar)str[3];
+copy_3:
+    dst[2] = (uchar)str[2];
+copy_2:
+    dst[1] = (uchar)str[1];
+copy_1:
+    dst[0] = (uchar)str[0];
+}
+
+// Prolog for fromLatin1_sse2_withprolog() that ignores the requested count:
+// it always reads 16 bytes from \a str and always stores 8 converted
+// characters at \a dst. The caller must therefore have at least 16 readable
+// source bytes and room for 8 ushorts.
+void fromLatin1_prolog_sse2_overcommit(ushort *dst, const char *str, int)
+{
+    // do one iteration of conversion
+    const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+
+    // unpack only the first 8 bytes, padding with zeros
+    const __m128i nullMask = _mm_set1_epi32(0);
+    const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+    _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+}
+
+// SSE2 conversion that aligns \a dst before entering the main loop.
+//
+// For inputs of at least 16 characters, prologFunction converts the
+// characters that precede the next 16-byte boundary of \a dst, so that the
+// main loop can use aligned stores. Shorter inputs skip the main loop and
+// are converted entirely by fromLatin1_epilog().
+//
+// \a dst is assumed to be 2-byte aligned; otherwise no whole number of
+// ushorts reaches a 16-byte boundary.
+template<FromLatin1Function prologFunction>
+void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size)
+{
+    // same as the improved code, but we attempt to align at the prolog
+    // therefore, we issue aligned stores
+
+    if (size >= 16) {
+        uint misalignment = uint(quintptr(dst) & 0xf);
+
+        // Nothing to do when dst is already aligned. Without this check the
+        // prolog was asked for 8 characters, which
+        // fromLatin1_prolog_unrolled() does not handle: the pointers still
+        // advanced and 8 characters were left unconverted.
+        if (misalignment) {
+            uint prologCount = (16 - misalignment) / 2;
+
+            prologFunction(dst, str, prologCount);
+
+            size -= prologCount;
+            dst += prologCount;
+            str += prologCount;
+        }
+    }
+
+    const __m128i nullMask = _mm_set1_epi32(0);
+    qptrdiff counter = 0;
+    // bias size so that "size >= counter" means "16 or more characters left"
+    size -= 16;
+    while (size >= counter) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+        _mm_store_si128((__m128i*)(dst + counter), firstHalf); // store
+
+        // unpack the last 8 bytes, padding with zeros
+        const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+        _mm_store_si128((__m128i*)(dst + counter + 8), secondHalf); // store
+
+        counter += 16;
+    }
+    // undo the bias before computing the tail length
+    size += 16;
+    fromLatin1_epilog(dst + counter, str + counter, size - counter);
+}
+
+#ifdef __SSE4_1__
+// Same loop structure as fromLatin1_sse2_improved(), but the bytes are
+// widened with _mm_cvtepu8_epi16 instead of unpacking against a zero
+// register. The upper 8 bytes of each chunk are shifted down before the
+// second conversion.
+void fromLatin1_sse4_pmovzxbw(ushort *dst, const char *str, int size)
+{
+    qptrdiff counter = 0;
+    // bias size so that "size >= counter" means "16 or more characters left"
+    size -= 16;
+    while (size >= counter) {
+        __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_cvtepu8_epi16(chunk);
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
+
+        // unpack the last 8 bytes, padding with zeros
+        chunk = _mm_srli_si128(chunk, 8);
+        const __m128i secondHalf = _mm_cvtepu8_epi16(chunk);
+        _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
+
+        counter += 16;
+    }
+    // undo the bias before computing the tail length
+    size += 16;
+    fromLatin1_epilog(dst + counter, str + counter, size - counter);
+}
+
+// Prolog for fromLatin1_sse2_withprolog() that ignores the requested count:
+// it always converts 8 characters from \a str and stores them at \a dst.
+// Neither pointer needs to be 16-byte aligned.
+void fromLatin1_prolog_sse4_overcommit(ushort *dst, const char *str, int)
+{
+    // load exactly 8 bytes and zero-extend them to 16 bits each.
+    // _mm_loadl_epi64 has no alignment requirement, whereas dereferencing str
+    // as a __m128i promised the compiler a 16-byte aligned, 16-byte wide
+    // object.
+    const __m128i chunk = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)str)); // load
+    _mm_storeu_si128((__m128i*)dst, chunk); // store
+}
+#endif
+#endif
+
+#ifdef __ARM_NEON__
+// Fully unrolled scalar tail used by the Neon conversions: widens the first
+// \a size characters of \a str into \a dst. At most 8 characters are
+// converted, whatever the value of \a size.
+static inline void fromLatin1_epilog(ushort *dst, const char *str, int size)
+{
+    if (!size) return;
+    dst[0] = (uchar)str[0];
+    if (!--size) return;
+    dst[1] = (uchar)str[1];
+    if (!--size) return;
+    dst[2] = (uchar)str[2];
+    if (!--size) return;
+    dst[3] = (uchar)str[3];
+    if (!--size) return;
+    dst[4] = (uchar)str[4];
+    if (!--size) return;
+    dst[5] = (uchar)str[5];
+    if (!--size) return;
+    dst[6] = (uchar)str[6];
+    if (!--size) return;
+    dst[7] = (uchar)str[7];
+    // the final check below has no effect: nothing follows it
+    if (!--size) return;
+}
+
+// Neon conversion using intrinsics: widens 8 characters per iteration and
+// leaves the remaining (fewer than 8) characters to fromLatin1_epilog().
+void fromLatin1_neon_improved(ushort *dst, const char *str, int len)
+{
+    while (len >= 8) {
+        // load 8 bytes into one doubleword Neon register
+        const uint8x8_t chunk = vld1_u8((uint8_t *)str);
+        str += 8;
+
+        // expand 8 bytes into 16 bytes in a quadword register
+        const uint16x8_t expanded = vmovl_u8(chunk);
+        vst1q_u16(dst, expanded); // store
+        dst += 8;
+
+        len -= 8;
+    }
+    fromLatin1_epilog(dst, str, len);
+}
+
+// Neon conversion using intrinsics: widens 16 characters per loop iteration,
+// then at most one block of 8, and leaves the remaining (fewer than 8)
+// characters to fromLatin1_epilog().
+void fromLatin1_neon_improved2(ushort *dst, const char *str, int len)
+{
+    while (len >= 16) {
+        // load 16 bytes into one quadword Neon register
+        const uint8x16_t chunk = vld1q_u8((uint8_t *)str);
+        str += 16;
+
+        // expand each doubleword of the quadword register into a quadword
+        const uint16x8_t expanded_low = vmovl_u8(vget_low_u8(chunk));
+        vst1q_u16(dst, expanded_low); // store
+        dst += 8;
+        const uint16x8_t expanded_high = vmovl_u8(vget_high_u8(chunk));
+        vst1q_u16(dst, expanded_high); // store
+        dst += 8;
+
+        len -= 16;
+    }
+
+    // at most one remaining block of 8
+    if (len >= 8) {
+        // load 8 bytes into one doubleword Neon register
+        const uint8x8_t chunk = vld1_u8((uint8_t *)str);
+        str += 8;
+
+        // expand 8 bytes into 16 bytes in a quadword register
+        const uint16x8_t expanded = vmovl_u8(chunk);
+        vst1q_u16(dst, expanded); // store
+        dst += 8;
+
+        len -= 8;
+    }
+    fromLatin1_epilog(dst, str, len);
+}
+
+// Neon conversion written as inline assembly: widens 8 characters per
+// iteration (load, zero-extend, store, with post-incremented \a str and
+// \a dst) and leaves the remaining (fewer than 8) characters to
+// fromLatin1_epilog().
+void fromLatin1_neon_handwritten(ushort *dst, const char *str, int len)
+{
+    // same as above, but handwritten Neon
+    while (len >= 8) {
+        uint16x8_t chunk;
+        asm (
+            "vld1.8     %[chunk], [%[str]]!\n"
+            "vmovl.u8   %q[chunk], %[chunk]\n"
+            "vst1.16    %h[chunk], [%[dst]]!\n"
+            : [dst] "+r" (dst),
+              [str] "+r" (str),
+              [chunk] "=w" (chunk)
+            : // no input-only operands
+            // the asm reads *str and writes *dst behind the compiler's back
+            : "memory");
+        len -= 8;
+    }
+
+    fromLatin1_epilog(dst, str, len);
+}
+
+// Neon conversion written as inline assembly: widens 16 characters per loop
+// iteration, then at most one block of 8, and leaves the remaining (fewer
+// than 8) characters to fromLatin1_epilog(). \a str and \a dst are
+// post-incremented by the load and store instructions.
+void fromLatin1_neon_handwritten2(ushort *dst, const char *str, int len)
+{
+    // same as above, but handwritten Neon
+    while (len >= 16) {
+        uint16x8_t chunk1, chunk2;
+        asm (
+            "vld1.8     %h[chunk1], [%[str]]!\n"
+            "vmovl.u8   %q[chunk2], %f[chunk1]\n"
+            "vmovl.u8   %q[chunk1], %e[chunk1]\n"
+            "vst1.16    %h[chunk1], [%[dst]]!\n"
+            "vst1.16    %h[chunk2], [%[dst]]!\n"
+          : [dst] "+r" (dst),
+            [str] "+r" (str),
+            [chunk1] "=w" (chunk1),
+            [chunk2] "=w" (chunk2)
+          : // no input-only operands
+          // the asm reads *str and writes *dst behind the compiler's back
+          : "memory");
+        len -= 16;
+    }
+
+    // at most one remaining block of 8
+    if (len >= 8) {
+        uint16x8_t chunk;
+        asm (
+            "vld1.8     %[chunk], [%[str]]!\n"
+            "vmovl.u8   %q[chunk], %[chunk]\n"
+            "vst1.16    %h[chunk], [%[dst]]!\n"
+            : [dst] "+r" (dst),
+              [str] "+r" (str),
+              [chunk] "=w" (chunk)
+            : // no input-only operands
+            : "memory");
+        len -= 8;
+    }
+
+    fromLatin1_epilog(dst, str, len);
+}
+#endif
+
+// Registers one data row per Latin-1 conversion alternative available in
+// this build. The "empty" row carries a null function pointer, for which
+// fromLatin1Alternatives_internal() walks the data set without converting
+// anything.
+void tst_QString::fromLatin1Alternatives_data() const
+{
+    QTest::addColumn<FromLatin1Function>("function");
+    QTest::newRow("empty") << FromLatin1Function(0);
+    QTest::newRow("regular") << &fromLatin1_regular;
+#ifdef __SSE2__
+    QTest::newRow("sse2-qt4.7") << &fromLatin1_sse2_qt47;
+    QTest::newRow("sse2-improved") << &fromLatin1_sse2_improved;
+    QTest::newRow("sse2-improved2") << &fromLatin1_sse2_improved2;
+    QTest::newRow("sse2-with-prolog-regular") << &fromLatin1_sse2_withprolog<&fromLatin1_regular>;
+    QTest::newRow("sse2-with-prolog-unrolled") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_unrolled>;
+    QTest::newRow("sse2-with-prolog-sse2-overcommit") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_sse2_overcommit>;
+#ifdef __SSE4_1__
+    QTest::newRow("sse2-with-prolog-sse4-overcommit") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_sse4_overcommit>;
+    QTest::newRow("sse4-pmovzxbw") << &fromLatin1_sse4_pmovzxbw;
+#endif
+#endif
+#ifdef __ARM_NEON__
+    QTest::newRow("neon-improved") << &fromLatin1_neon_improved;
+    QTest::newRow("neon-improved2") << &fromLatin1_neon_improved2;
+    QTest::newRow("neon-handwritten") << &fromLatin1_neon_handwritten;
+    QTest::newRow("neon-handwritten2") << &fromLatin1_neon_handwritten2;
+#endif
+}
+
+extern StringData fromLatin1Data;
+// Runs \a function over every entry of fromLatin1Data, writing into \a dst.
+//
+// With \a doVerify false the function is simply called on the start of
+// \a dst. With \a doVerify true, \a dst is filled with 'x' first, the
+// function writes 8 characters into the buffer, and the result is compared
+// against QString::fromLatin1(); the 8 characters before and after the
+// converted range must still be 'x'.
+//
+// A null \a function skips the conversion but still iterates over the
+// entries.
+static void fromLatin1Alternatives_internal(FromLatin1Function function, QString &dst, bool doVerify)
+{
+    // NOTE(review): assumed to mirror the layout of the records behind
+    // fromLatin1Data.entries; only len and offset1 are used here
+    struct Entry
+    {
+        int len;
+        int offset1, offset2;
+        int align1, align2;
+    };
+    const Entry *entries = reinterpret_cast<const Entry *>(fromLatin1Data.entries);
+
+    for (int i = 0; i < fromLatin1Data.entryCount; ++i) {
+        int len = entries[i].len;
+        const char *src = fromLatin1Data.charData + entries[i].offset1;
+
+        if (!function)
+            continue;
+        if (!doVerify) {
+            (function)(&dst.data()->unicode(), src, len);
+        } else {
+            // reset the guard characters
+            dst.fill(QChar('x'), dst.length());
+
+            (function)(&dst.data()->unicode() + 8, src, len);
+
+            // despite its name, this holds eight 'x' characters
+            QString zeroes(8, QChar('x'));
+            QString final = dst.mid(8, len);
+            QCOMPARE(final, QString::fromLatin1(src, len));
+            // neither guard zone may have been overwritten
+            QCOMPARE(dst.left(8), zeroes);
+            QCOMPARE(dst.mid(len + 8, 8), zeroes);
+        }
+    }
+}
+
+// Verifies the conversion function of the current data row once and then
+// benchmarks it over the whole fromLatin1Data set.
+void tst_QString::fromLatin1Alternatives() const
+{
+    QFETCH(FromLatin1Function, function);
+
+    // room for the longest input plus 8 guard characters on each side
+    QString buffer(fromLatin1Data.maxLength + 16, QChar('x'));
+
+    // correctness pass, outside of the timed region
+    const bool verify = true;
+    fromLatin1Alternatives_internal(function, buffer, verify);
+
+    QBENCHMARK {
+        fromLatin1Alternatives_internal(function, buffer, !verify);
+    }
+}
+
+// Common signature of the UTF-8 decoding alternatives (destination, source,
+// source length); registered as a metatype so that it can be used as a QTest
+// data column.
+typedef int (* FromUtf8Function)(ushort *, const char *, int);
+Q_DECLARE_METATYPE(FromUtf8Function)
+
+// Always null in this file; it exists so that the decoder code copied from
+// qutfcodec.cpp can keep its state handling.
+extern QTextCodec::ConverterState *state;
+QTextCodec::ConverterState *state = 0; // just because the code in qutfcodec.cpp uses a state
+
+// Adapter giving fromLatin1_regular() the FromUtf8Function signature.
+// Every input byte becomes exactly one UTF-16 unit, hence the result is len.
+int fromUtf8_latin1_regular(ushort *dst, const char *chars, int len)
+{
+    fromLatin1_regular(dst, chars, len);
+
+    // one output unit per input byte
+    return len;
+}
+
+#ifdef __SSE2__
+// Adapter giving fromLatin1_sse2_qt47() the FromUtf8Function signature.
+// Every input byte becomes exactly one UTF-16 unit, hence the result is len.
+int fromUtf8_latin1_qt47(ushort *dst, const char *chars, int len)
+{
+    fromLatin1_sse2_qt47(dst, chars, len);
+
+    // one output unit per input byte
+    return len;
+}
+
+// Adapter giving fromLatin1_sse2_improved() the FromUtf8Function signature.
+// Every input byte becomes exactly one UTF-16 unit, hence the result is len.
+int fromUtf8_latin1_sse2_improved(ushort *dst, const char *chars, int len)
+{
+    fromLatin1_sse2_improved(dst, chars, len);
+
+    // one output unit per input byte
+    return len;
+}
+#endif
+
+// Returns true if ucs4 is one of the Unicode "non-characters".
+static inline bool isUnicodeNonCharacter(uint ucs4)
+{
+    // Unicode has a couple of "non-characters" that one can use internally,
+    // but are not allowed to be used for text interchange.
+    //
+    // Those are the last two entries of each Unicode Plane (U+FFFE, U+FFFF,
+    // U+1FFFE, U+1FFFF, etc.) as well as the 32 entries between U+FDD0 and
+    // U+FDEF (inclusive)
+
+    // The subtraction is unsigned: anything below U+FDD0 wraps around to a
+    // huge value, so a single comparison covers both ends of the range.
+    return (ucs4 & 0xfffe) == 0xfffe
+            || (ucs4 - 0xfdd0U) < 32;
+}
+
+int fromUtf8_qt47(ushort *dst, const char *chars, int len)
+{
+    // This is almost the code found in Qt 4.7's qutfcodec.cpp, QUtf8Codec::convertToUnicode.
+    // That function returns a QString; this one writes the UTF-16 code units to dst (no bounds checks) and returns how many it wrote.
+    // That's to avoid doing malloc() inside the benchmark test.
+    // Any differences between this code and the original are just because of that, I promise
+
+    bool headerdone = false; // true once a byte order mark must no longer be skipped
+    ushort replacement = QChar::ReplacementCharacter; // what gets written for malformed input
+    int need = 0; // continuation bytes still expected for the sequence in progress
+    int error = -1; // index of the lead byte of the sequence in progress
+    uint uc = 0; // code point being assembled
+    uint min_uc = 0; // smallest code point that is valid for the current sequence length (overlong check)
+    if (state) { // resume from an earlier call; the global state defined in this file is null
+        if (state->flags & QTextCodec::IgnoreHeader)
+            headerdone = true;
+        if (state->flags & QTextCodec::ConvertInvalidToNull)
+            replacement = QChar::Null;
+        need = state->remainingChars;
+        if (need) {
+            uc = state->state_data[0];
+            min_uc = state->state_data[1];
+        }
+    }
+    if (!headerdone && len > 3
+        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+        // starts with a byte order mark (only skipped when more than three bytes are available)
+        chars += 3;
+        len -= 3;
+        headerdone = true;
+    }
+
+    // QString result(need + len + 1, Qt::Uninitialized); // worst case
+    // ushort *qch = (ushort *)result.unicode();
+    ushort *qch = dst; // write cursor into the caller's buffer
+    uchar ch;
+    int invalid = 0; // number of replacement characters emitted
+
+    for (int i = 0; i < len; ++i) {
+        ch = chars[i];
+        if (need) {
+            if ((ch&0xc0) == 0x80) { // continuation byte: append its six payload bits
+                uc = (uc << 6) | (ch & 0x3f);
+                --need;
+                if (!need) {
+                    // utf-8 bom composes into 0xfeff code point
+                    bool nonCharacter;
+                    if (!headerdone && uc == 0xfeff) {
+                        // don't do anything, just skip the BOM
+                    } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
+                        // surrogate pair
+                        //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
+                        *qch++ = QChar::highSurrogate(uc);
+                        *qch++ = QChar::lowSurrogate(uc);
+                    } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
+                        // error: overlong sequence, UTF16 surrogate, non-character or beyond U+10FFFF
+                        *qch++ = replacement;
+                        ++invalid;
+                    } else {
+                        *qch++ = uc;
+                    }
+                    headerdone = true;
+                }
+            } else {
+                // error: a continuation byte was expected
+                i = error; // rewind to the lead byte; the loop's ++i resumes right after it
+                *qch++ = replacement;
+                ++invalid;
+                need = 0;
+                headerdone = true;
+            }
+        } else {
+            if (ch < 128) { // ASCII
+                *qch++ = ushort(ch);
+                headerdone = true;
+            } else if ((ch & 0xe0) == 0xc0) { // lead byte of a two-byte sequence
+                uc = ch & 0x1f;
+                need = 1;
+                error = i;
+                min_uc = 0x80;
+                headerdone = true;
+            } else if ((ch & 0xf0) == 0xe0) { // lead byte of a three-byte sequence; headerdone is left alone because this may be the BOM
+                uc = ch & 0x0f;
+                need = 2;
+                error = i;
+                min_uc = 0x800;
+            } else if ((ch&0xf8) == 0xf0) { // lead byte of a four-byte sequence
+                uc = ch & 0x07;
+                need = 3;
+                error = i;
+                min_uc = 0x10000;
+                headerdone = true;
+            } else {
+                // error: stray continuation byte or invalid lead byte
+                *qch++ = replacement;
+                ++invalid;
+                headerdone = true;
+            }
+        }
+    }
+    if (!state && need > 0) {
+        // unterminated UTF sequence: one replacement per byte from its lead byte to the end
+        for (int i = error; i < len; ++i) {
+            *qch++ = replacement;
+            ++invalid;
+        }
+    }
+    //result.truncate(qch - (ushort *)result.unicode());
+    if (state) { // hand the unfinished sequence, if any, over to the next call
+        state->invalidChars += invalid;
+        state->remainingChars = need;
+        if (headerdone)
+            state->flags |= QTextCodec::IgnoreHeader;
+        state->state_data[0] = need ? uc : 0;
+        state->state_data[1] = need ? min_uc : 0;
+    }
+    //return result;
+    return qch - dst; // number of UTF-16 code units written
+}
+
+int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len)
+{
+    // This is the same code as above, but for stateless UTF-8 conversion: no ConverterState is read or updated.
+    // No other improvements. Writes to dst without bounds checks and returns the number of UTF-16 code units written.
+    bool headerdone = false; // true once a byte order mark must no longer be skipped
+    const ushort replacement = QChar::ReplacementCharacter; // what gets written for malformed input
+    int need = 0; // continuation bytes still expected for the sequence in progress
+    int error = -1; // index of the lead byte of the sequence in progress
+    uint uc = 0; // code point being assembled
+    uint min_uc = 0; // smallest code point that is valid for the current sequence length (overlong check)
+
+    if (len > 3
+        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+        // starts with a byte order mark (only skipped when more than three bytes are available)
+        chars += 3;
+        len -= 3; // NOTE(review): unlike fromUtf8_qt47, headerdone stays false here, so a second BOM directly after this one is dropped too
+    }
+
+    // QString result(need + len + 1, Qt::Uninitialized); // worst case
+    // ushort *qch = (ushort *)result.unicode();
+    ushort *qch = dst; // write cursor into the caller's buffer
+    uchar ch;
+    int invalid = 0; // number of replacement characters emitted (counted but not reported)
+
+    for (int i = 0; i < len; ++i) {
+        ch = chars[i];
+        if (need) {
+            if ((ch&0xc0) == 0x80) { // continuation byte: append its six payload bits
+                uc = (uc << 6) | (ch & 0x3f);
+                --need;
+                if (!need) {
+                    // utf-8 bom composes into 0xfeff code point
+                    bool nonCharacter;
+                    if (!headerdone && uc == 0xfeff) {
+                        // don't do anything, just skip the BOM
+                    } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
+                        // surrogate pair
+                        //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
+                        *qch++ = QChar::highSurrogate(uc);
+                        *qch++ = QChar::lowSurrogate(uc);
+                    } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
+                        // error: overlong sequence, UTF16 surrogate, non-character or beyond U+10FFFF
+                        *qch++ = replacement;
+                        ++invalid;
+                    } else {
+                        *qch++ = uc;
+                    }
+                    headerdone = true;
+                }
+            } else {
+                // error: a continuation byte was expected
+                i = error; // rewind to the lead byte; the loop's ++i resumes right after it
+                *qch++ = replacement;
+                ++invalid;
+                need = 0;
+                headerdone = true;
+            }
+        } else {
+            if (ch < 128) { // ASCII
+                *qch++ = ushort(ch);
+                headerdone = true;
+            } else if ((ch & 0xe0) == 0xc0) { // lead byte of a two-byte sequence
+                uc = ch & 0x1f;
+                need = 1;
+                error = i;
+                min_uc = 0x80;
+                headerdone = true;
+            } else if ((ch & 0xf0) == 0xe0) { // lead byte of a three-byte sequence; headerdone is left alone because this may be the BOM
+                uc = ch & 0x0f;
+                need = 2;
+                error = i;
+                min_uc = 0x800;
+            } else if ((ch&0xf8) == 0xf0) { // lead byte of a four-byte sequence
+                uc = ch & 0x07;
+                need = 3;
+                error = i;
+                min_uc = 0x10000;
+                headerdone = true;
+            } else {
+                // error: stray continuation byte or invalid lead byte
+                *qch++ = replacement;
+                ++invalid;
+                headerdone = true;
+            }
+        }
+    }
+    if (need > 0) {
+        // unterminated UTF sequence: one replacement per byte from its lead byte to the end
+        for (int i = error; i < len; ++i) {
+            *qch++ = replacement;
+            ++invalid;
+        }
+    }
+    //result.truncate(qch - (ushort *)result.unicode());
+    //return result;
+    return qch - dst; // number of UTF-16 code units written
+}
+
+// Decodes the one non-ASCII UTF-8 sequence that starts at chars[counter] and
+// stores the resulting UTF-16 code unit(s) starting at dst[counter].
+//
+// The callers use a single index, counter, for both the input and the output.
+// A sequence of N bytes that produces M code units therefore advances counter
+// by M and re-bases chars by N - M (shrinking len by the same amount), so that
+// on return chars[counter] is the next unread byte, dst[counter] is the next
+// free output slot and chars + len still points at the end of the input.
+//
+// trusted == true   the input is assumed to be well-formed; no validation and
+//                   no bounds checks are performed
+// trusted == false  malformed, truncated, overlong, surrogate and non-character
+//                   sequences are replaced by QChar::ReplacementCharacter
+//
+// dst is taken by reference but never modified here.
+template <bool trusted>
+static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptrdiff &counter, int &len)
+{
+    uchar ch = chars[counter];
+
+    // is it a leading or a continuation one?
+    if (!trusted && (ch & 0xc0) == 0x80) {
+        // continuation character found without the leading
+        dst[counter++] = QChar::ReplacementCharacter;
+        return;
+    }
+
+    if ((ch & 0xe0) == 0xc0) {
+        // two-byte UTF-8 sequence
+        if (!trusted && counter + 1 == len) {
+            // truncated: the lead byte is the last byte of the input
+            dst[counter++] = QChar::ReplacementCharacter;
+            return;
+        }
+
+        uchar ch2 = chars[counter + 1];
+        if (!trusted)
+            if ((ch2 & 0xc0) != 0x80) {
+                // only the lead byte is consumed; ch2 gets decoded on its own
+                dst[counter++] = QChar::ReplacementCharacter;
+                return;
+            }
+
+        ushort ucs = (ch & 0x1f);
+        ucs <<= 6;
+        ucs |= (ch2 & 0x3f);
+
+        // dst[counter] will correspond to chars[counter..counter+1], so adjust
+        ++chars;
+        --len;
+        if (trusted || ucs >= 0x80)
+            dst[counter] = ucs;
+        else
+            dst[counter] = QChar::ReplacementCharacter;     // overlong encoding
+        ++counter;
+        return;
+    }
+
+    if ((ch & 0xf0) == 0xe0) {
+        // three-byte UTF-8 sequence
+        if (!trusted && counter + 2 >= len) {
+            dst[counter++] = QChar::ReplacementCharacter;
+            return;
+        }
+
+        uchar ch2 = chars[counter + 1];
+        uchar ch3 = chars[counter + 2];
+        if (!trusted)
+            if ((ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80) {
+                dst[counter++] = QChar::ReplacementCharacter;
+                return;
+            }
+
+        // a three-byte lead is 1110xxxx: four payload bits
+        ushort ucs = (ch & 0x0f) << 12 | (ch2 & 0x3f) << 6 | (ch3 & 0x3f);
+
+        // dst[counter] will correspond to chars[counter..counter+2], so adjust
+        chars += 2;
+        len -= 2;
+        if (!trusted &&
+            (ucs < 0x800 || isUnicodeNonCharacter(ucs) || (ucs >= 0xd800 && ucs <= 0xdfff)))
+            dst[counter] = QChar::ReplacementCharacter;
+        else
+            dst[counter] = ucs;
+        ++counter;
+        return;
+    }
+
+    if ((ch & 0xf8) == 0xf0) {
+        // four-byte UTF-8 sequence
+        // will require an UTF-16 surrogate pair
+        if (!trusted && counter + 3 >= len) {
+            dst[counter++] = QChar::ReplacementCharacter;
+            return;
+        }
+
+        uchar ch2 = chars[counter + 1];
+        uchar ch3 = chars[counter + 2];
+        uchar ch4 = chars[counter + 3];
+        if (!trusted)
+            if ((ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80 || (ch4 & 0xc0) != 0x80) {
+                dst[counter++] = QChar::ReplacementCharacter;
+                return;
+            }
+
+        // a four-byte lead is 11110xxx: only three payload bits. Masking with
+        // anything wider would leak the 0x10 marker bit into the code point.
+        uint ucs = (ch & 0x07) << 18 | (ch2 & 0x3f) << 12
+                   | (ch3 & 0x3f) << 6 | (ch4 & 0x3f);
+
+        if (trusted || (ucs >= 0x10000 && ucs < 0x110000 && !isUnicodeNonCharacter(ucs))) {
+            // dst[counter..counter+1] will correspond to chars[counter..counter+3]:
+            // four bytes in, two code units out, so re-base by two
+            chars += 2;
+            len -= 2;
+            dst[counter + 0] = QChar::highSurrogate(ucs);
+            dst[counter + 1] = QChar::lowSurrogate(ucs);
+            counter += 2;
+        } else {
+            // dst[counter] will correspond to chars[counter..counter+3]:
+            // four bytes in, one code unit out, so re-base by three
+            chars += 3;
+            len -= 3;
+            dst[counter++] = QChar::ReplacementCharacter;
+        }
+        return;
+    }
+
+    // Not a valid lead byte. With the high bit set this is 0xf8..0xff, which
+    // can never appear in UTF-8. The high-bit test keeps the function harmless
+    // for a caller that hands it a plain ASCII byte it has already stored.
+    if (!trusted && (ch & 0x80))
+        dst[counter] = QChar::ReplacementCharacter;
+    ++counter;
+}
+
+// Scalar UTF-8 -> UTF-16 decoder whose main loop is a plain byte copy for
+// ASCII; anything with the high bit set goes to extract_utf8_multibyte<false>().
+// A leading byte order mark is skipped. Returns the number of ushorts written.
+int fromUtf8_optimised_for_ascii(ushort *qch, const char *chars, int len)
+{
+    const bool startsWithBom = len > 3
+        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf;
+    if (startsWithBom) {
+        // starts with a byte order mark
+        chars += 3;
+        len -= 3;
+    }
+
+    ushort *dst = qch;
+    qptrdiff counter = 0;   // shared index into chars and dst
+    while (counter < len) {
+        const uchar byte = chars[counter];
+        if (byte & 0x80) {
+            // UTF-8 character found: the helper advances counter and
+            // re-bases chars/len to keep both buffers on the same index
+            extract_utf8_multibyte<false>(dst, chars, counter, len);
+        } else {
+            dst[counter] = byte;
+            ++counter;
+        }
+    }
+    return dst + counter - qch;
+}
+
+#ifdef __SSE2__
+int fromUtf8_sse2_optimised_for_ascii(ushort *qch, const char *chars, int len)
+{   /*
+     * UTF-8 -> UTF-16 conversion that loads 16 input bytes at a time with SSE2,
+     * zero-extends them to 16 bits and stores them; bytes with the high bit set
+     * are handed to extract_utf8_multibyte<false>(), which validates them.
+     * A leading byte order mark is skipped. Input closer than 16 bytes to the
+     * end is processed one byte at a time.
+     * Returns the number of ushorts written to qch.
+     */
+    if (len > 3
+        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+        // starts with a byte order mark
+        chars += 3;
+        len -= 3;
+    }
+
+    qptrdiff counter = 0; // shared index into chars and dst
+    ushort *dst = qch;
+
+    len -= 16; // while counter < len, a full 16-byte load stays inside the input
+    const __m128i nullMask = _mm_set1_epi32(0);
+    while (counter < len) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(chars + counter)); // load
+        ushort highbytes = _mm_movemask_epi8(chunk); // bit n is set when byte n has its high bit set
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
+
+        if (!uchar(highbytes)) { // the first 8 bytes are all ASCII
+            // unpack the last 8 bytes, padding with zeros
+            const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+            _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
+
+            if (!highbytes) { // all 16 bytes are ASCII
+                counter += 16;
+                continue;
+            }
+        }
+
+        // UTF-8 character found
+        // which one? (bsf_nonzero is defined elsewhere; presumably the index of the lowest set bit)
+        counter += bsf_nonzero(highbytes);
+        len += 16; // the helper needs the real length for its bounds checks and shrinks it itself
+        extract_utf8_multibyte<false>(dst, chars, counter, len);
+        len -= 16;
+    }
+    len += 16; // back to the real length for the byte-wise tail
+
+    while (counter < len) {
+        uchar ch = chars[counter];
+        if ((ch & 0x80) == 0) {
+            dst[counter] = ch;
+            ++counter;
+            continue;
+        }
+
+        // UTF-8 character found
+        extract_utf8_multibyte<false>(dst, chars, counter, len);
+    }
+    return dst + counter - qch;
+}
+
+int fromUtf8_sse2_trusted_no_bom(ushort *qch, const char *chars, int len)
+{   /*
+     * UTF-8 -> UTF-16 conversion that loads 16 input bytes at a time with SSE2,
+     * zero-extends them to 16 bits and stores them; bytes with the high bit set
+     * are handed to extract_utf8_multibyte<true>(), i.e. without validation.
+     * No byte order mark handling is done. Input closer than 16 bytes to the
+     * end is processed one byte at a time.
+     * Returns the number of ushorts written to qch.
+     */
+    qptrdiff counter = 0; // shared index into chars and dst
+    ushort *dst = qch;
+
+    len -= 16; // while counter < len, a full 16-byte load stays inside the input
+    const __m128i nullMask = _mm_set1_epi32(0);
+    while (counter < len) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(chars + counter)); // load
+        ushort highbytes = _mm_movemask_epi8(chunk); // bit n is set when byte n has its high bit set
+
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
+
+        if (!uchar(highbytes)) { // the first 8 bytes are all ASCII
+            // unpack the last 8 bytes, padding with zeros
+            const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+            _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
+
+            if (!highbytes) { // all 16 bytes are ASCII
+                counter += 16;
+                continue;
+            }
+        }
+
+        // UTF-8 character found
+        // which one? (bsf_nonzero is defined elsewhere; presumably the index of the lowest set bit)
+        counter += bsf_nonzero(highbytes);
+        len += 16; // the helper works with the real length and shrinks it itself
+        extract_utf8_multibyte<true>(dst, chars, counter, len);
+        len -= 16;
+    }
+    len += 16; // back to the real length for the byte-wise tail
+
+    while (counter < len) {
+        uchar ch = chars[counter];
+        if ((ch & 0x80) == 0) {
+            dst[counter] = ch;
+            ++counter;
+            continue;
+        }
+
+        // UTF-8 character found
+        extract_utf8_multibyte<true>(dst, chars, counter, len);
+    }
+    return dst + counter - qch;
+}
+#endif
+
+#ifdef __ARM_NEON__
+// Adapter giving fromLatin1_neon_improved() the FromUtf8Function signature.
+// Every input byte becomes exactly one UTF-16 unit, hence the result is len.
+int fromUtf8_latin1_neon(ushort *dst, const char *chars, int len)
+{
+    fromLatin1_neon_improved(dst, chars, len);
+
+    // one output unit per input byte
+    return len;
+}
+
+// UTF-8 -> UTF-16 conversion that widens 8 input bytes per iteration with
+// NEON; bytes with the high bit set are handed to
+// extract_utf8_multibyte<false>(), which validates them. A leading byte order
+// mark is skipped and the last (fewer than 8) bytes are processed one at a
+// time. Returns the number of ushorts written to qch.
+int fromUtf8_neon(ushort *qch, const char *chars, int len)
+{
+    if (len > 3
+        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+        // starts with a byte order mark
+        chars += 3;
+        len -= 3;
+    }
+
+    ushort *dst = qch;
+    const uint8x8_t highBit = vdup_n_u8(0x80);
+    while (len >= 8) {
+        // load 8 bytes into one doubleword Neon register
+        const uint8x8_t chunk = vld1_u8((uint8_t *)chars);
+        const uint16x8_t expanded = vmovl_u8(chunk);
+        vst1q_u16(dst, expanded);
+
+        // each byte of highBits is 0xff where the input byte has its high bit set
+        uint8x8_t highBits = vtst_u8(chunk, highBit);
+        // we need to find the lowest byte set
+        int mask_low = vget_lane_u32(vreinterpret_u32_u8(highBits), 0);
+        int mask_high = vget_lane_u32(vreinterpret_u32_u8(highBits), 1);
+
+        if (__builtin_expect(mask_low == 0 && mask_high == 0, 1)) {
+            chars += 8;
+            dst += 8;
+            len -= 8;
+        } else {
+            // UTF-8 character found
+            // which one?
+            // The number of trailing zero bits of the first non-zero word is
+            // 8 times the byte index within that word. The previous inline
+            // asm ("rbit %0, %1; clz %1, %1") wrote the clz result into its
+            // input operand, so pos was only right when the compiler happened
+            // to allocate both operands to the same register.
+            const unsigned mask = mask_low ? mask_low : mask_high;
+            qptrdiff pos = __builtin_ctz(mask) / 8;
+            if (!mask_low)
+                pos += 4;   // the byte lives in the second word: bytes 4..7
+
+            extract_utf8_multibyte<false>(dst, chars, pos, len);
+            chars += pos;
+            dst += pos;
+            len -= pos;
+        }
+    }
+
+    // fewer than 8 bytes left: finish one byte at a time
+    qptrdiff counter = 0;
+    while (counter < len) {
+        uchar ch = chars[counter];
+        if ((ch & 0x80) == 0) {
+            dst[counter] = ch;
+            ++counter;
+            continue;
+        }
+        // UTF-8 character found
+        extract_utf8_multibyte<false>(dst, chars, counter, len);
+    }
+    return dst + counter - qch;
+}
+
+// UTF-8 -> UTF-16 conversion that widens 8 input bytes per iteration with
+// NEON; bytes with the high bit set are handed to
+// extract_utf8_multibyte<true>(), i.e. the input is not validated. No byte
+// order mark handling is done and the last (fewer than 8) bytes are processed
+// one at a time. Returns the number of ushorts written to qch.
+int fromUtf8_neon_trusted(ushort *qch, const char *chars, int len)
+{
+    ushort *dst = qch;
+    const uint8x8_t highBit = vdup_n_u8(0x80);
+    while (len >= 8) {
+        // load 8 bytes into one doubleword Neon register
+        const uint8x8_t chunk = vld1_u8((uint8_t *)chars);
+        const uint16x8_t expanded = vmovl_u8(chunk);
+        vst1q_u16(dst, expanded);
+
+        // each byte of highBits is 0xff where the input byte has its high bit set
+        uint8x8_t highBits = vtst_u8(chunk, highBit);
+        // we need to find the lowest byte set
+        int mask_low = vget_lane_u32(vreinterpret_u32_u8(highBits), 0);
+        int mask_high = vget_lane_u32(vreinterpret_u32_u8(highBits), 1);
+
+        if (__builtin_expect(mask_low == 0 && mask_high == 0, 1)) {
+            chars += 8;
+            dst += 8;
+            len -= 8;
+        } else {
+            // UTF-8 character found
+            // which one?
+            // The number of trailing zero bits of the first non-zero word is
+            // 8 times the byte index within that word. The previous inline
+            // asm ("rbit %0, %1; clz %1, %1") wrote the clz result into its
+            // input operand, so pos was only right when the compiler happened
+            // to allocate both operands to the same register.
+            const unsigned mask = mask_low ? mask_low : mask_high;
+            qptrdiff pos = __builtin_ctz(mask) / 8;
+            if (!mask_low)
+                pos += 4;   // the byte lives in the second word: bytes 4..7
+
+            extract_utf8_multibyte<true>(dst, chars, pos, len);
+            chars += pos;
+            dst += pos;
+            len -= pos;
+        }
+    }
+
+    // fewer than 8 bytes left: finish one byte at a time
+    qptrdiff counter = 0;
+    while (counter < len) {
+        uchar ch = chars[counter];
+        if ((ch & 0x80) == 0) {
+            dst[counter] = ch;
+            ++counter;
+            continue;
+        }
+
+        // UTF-8 character found
+        extract_utf8_multibyte<true>(dst, chars, counter, len);
+    }
+    return dst + counter - qch;
+}
+#endif
+
+// Data rows for fromUtf8Alternatives(): one row per converter, each carrying
+// the function pointer in the "function" column.
+void tst_QString::fromUtf8Alternatives_data() const
+{
+    QTest::addColumn<FromUtf8Function>("function");
+
+    // null function: the test loop skips the call for every entry
+    QTest::newRow("empty") << static_cast<FromUtf8Function>(0);
+
+    // real UTF-8 decoders
+    QTest::newRow("qt-4.7") << &fromUtf8_qt47;
+    QTest::newRow("qt-4.7-stateless") << &fromUtf8_qt47_stateless;
+    QTest::newRow("optimized-for-ascii") << &fromUtf8_optimised_for_ascii;
+#ifdef __SSE2__
+    QTest::newRow("sse2-optimized-for-ascii") << &fromUtf8_sse2_optimised_for_ascii;
+    QTest::newRow("sse2-trusted-no-bom") << &fromUtf8_sse2_trusted_no_bom;
+#endif
+#ifdef __ARM_NEON__
+    QTest::newRow("neon") << &fromUtf8_neon;
+    QTest::newRow("neon-trusted-no-bom") << &fromUtf8_neon_trusted;
+#endif
+
+    // Latin-1 converters behind the same signature; rows tagged "latin1-" are
+    // exempt from the UTF-8 verification pass
+    QTest::newRow("latin1-generic") << &fromUtf8_latin1_regular;
+#ifdef __SSE2__
+    QTest::newRow("latin1-sse2-qt4.7") << &fromUtf8_latin1_qt47;
+    QTest::newRow("latin1-sse2-improved") << &fromUtf8_latin1_sse2_improved;
+#endif
+#ifdef __ARM_NEON__
+    QTest::newRow("latin1-neon-improved") << &fromUtf8_latin1_neon;
+#endif
+}
+
+extern StringData fromUtf8Data;
+
+// Feeds every entry of fromUtf8Data to one UTF-8 -> UTF-16 converter.
+//
+// function  converter under test; a null pointer turns every entry into a no-op
+// dst       scratch string the converter writes into
+// doVerify  false: timing run; the converter is first run over the Latin-1
+//                  data set as well, then over the UTF-8 one
+//           true:  compare each result and its reported length with
+//                  QString::fromUtf8() and check the 'x' guard characters;
+//                  rows tagged "latin1-" are not verified at all
+static void fromUtf8Alternatives_internal(FromUtf8Function function, QString &dst, bool doVerify)
+{
+    if (doVerify) {
+        if (strncmp(QTest::currentDataTag(), "latin1-", 7) == 0)
+            return;
+    } else {
+        // NOTE: this only works because the Latin1 data is ASCII-only
+        fromLatin1Alternatives_internal(reinterpret_cast<FromLatin1Function>(function), dst, doVerify);
+    }
+
+    // layout of the records that fromUtf8Data.entries points to
+    struct Entry
+    {
+        int len;
+        int offset1, offset2;
+        int align1, align2;
+    };
+    const Entry *const table = reinterpret_cast<const Entry *>(fromUtf8Data.entries);
+    const int guard = 8;    // untouched characters expected before and after the output
+
+    for (int idx = 0; idx < fromUtf8Data.entryCount; ++idx) {
+        const int len = table[idx].len;
+        const char *input = fromUtf8Data.charData + table[idx].offset1;
+
+        // the check stays inside the loop so that a null function still pays
+        // for the iteration itself
+        if (!function)
+            continue;
+
+        if (doVerify) {
+            // start from a known pattern so stray writes become visible
+            dst.fill(QChar('x'), dst.length());
+
+            const int utf8len = function(&dst.data()->unicode() + guard, input, len);
+
+            const QString expected = QString::fromUtf8(input, len);
+            const QString converted = dst.mid(guard, expected.length());
+            const bool matches = converted == expected && utf8len == expected.length();
+            if (!matches)
+                qDebug() << idx << table[idx].offset1 << utf8len << converted << expected.length() << expected;
+
+            QCOMPARE(converted, expected);
+            QCOMPARE(utf8len, expected.length());
+
+            const QString padding(guard, QChar('x'));
+            QCOMPARE(dst.left(guard), padding);
+            QCOMPARE(dst.mid(len + guard, guard), padding);
+        } else {
+            function(&dst.data()->unicode(), input, len);
+        }
+    }
+}
+
+// Test slot: checks the selected UTF-8 converter once, then benchmarks it.
+void tst_QString::fromUtf8Alternatives() const
+{
+    QFETCH(FromUtf8Function, function);
+
+    // longest entry plus 8 guard characters in front of and behind the output
+    QString buffer(fromUtf8Data.maxLength + 16, QChar('x'));
+
+    // correctness pass, kept outside the timed section
+    fromUtf8Alternatives_internal(function, buffer, true);
+
+    QBENCHMARK {
+        fromUtf8Alternatives_internal(function, buffer, false);
+    }
+}
+
+QTEST_MAIN(tst_QString)
+
+#include "main.moc"
-- 
cgit v1.2.3