diff options
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 825 |
1 files changed, 482 insertions, 343 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 2c505ef033..f5e25f1de9 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -1,6 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies). +** Copyright (C) 2013 Intel Corporation ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -101,6 +102,43 @@ QT_BEGIN_NAMESPACE +/* + * Note on the use of SIMD in qstring.cpp: + * + * Several operations with strings are improved with the use of SIMD code, + * since they are repetitive. For MIPS, we have hand-written assembly code + * outside of qstring.cpp targeting MIPS DSP and MIPS DSPr2. For ARM and for + * x86, we can only use intrinsics and therefore everything is contained in + * qstring.cpp. We need to use intrinsics only for those platforms due to the + * different compilers and toolchains used, which have different syntax for + * assembly sources. + * + * ** SSE notes: ** + * + * Whenever multiple alternatives are equivalent or near so, we prefer the one + * using instructions from SSE2, since SSE2 is guaranteed to be enabled for all + * 64-bit builds and we enable it for 32-bit builds by default. Use of higher + * SSE versions should be done when there's a clear performance benefit and + * requires fallback code to SSE2, if it exists. + * + * Performance measurement in the past shows that most strings are short in + * size and, therefore, do not benefit from alignment prologues. That is, + * trying to find a 16-byte-aligned boundary to operate on is often more + * expensive than executing the unaligned operation directly. In addition, note + * that the QString private data is designed so that the data is stored on + * 16-byte boundaries if the system malloc() returns 16-byte aligned pointers + * on its own (64-bit glibc on Linux does; 32-bit glibc on Linux returns them + * 50% of the time), so skipping the alignment prologue is actually optimizing + * for the common case. + */ + +#if defined(__mips_dsp) +// From qstring_mips_dsp_asm.S +extern "C" void qt_fromlatin1_mips_asm_unroll4 (ushort*, const char*, uint); +extern "C" void qt_fromlatin1_mips_asm_unroll8 (ushort*, const char*, uint); +extern "C" void qt_toLatin1_mips_dsp_asm(uchar *dst, const ushort *src, int length); +#endif + // internal int qFindString(const QChar *haystack, int haystackLen, int from, const QChar *needle, int needleLen, Qt::CaseSensitivity cs); @@ -124,6 +162,194 @@ static inline bool qt_ends_with(const QChar *haystack, int haystackLen, static inline bool qt_ends_with(const QChar *haystack, int haystackLen, QLatin1String needle, Qt::CaseSensitivity cs); +#ifdef Q_COMPILER_LAMBDA +namespace { +template <uint MaxCount> struct UnrollTailLoop +{ + template <typename RetType, typename Functor1, typename Functor2> + static inline RetType exec(int count, RetType returnIfExited, Functor1 loopCheck, Functor2 returnIfFailed, int i = 0) + { + /* equivalent to: + * while (count--) { + * if (loopCheck(i)) + * return returnIfFailed(i); + * } + * return returnIfExited; + */ + + if (!count) + return returnIfExited; + + bool check = loopCheck(i); + if (check) { + const RetType &retval = returnIfFailed(i); + return retval; + } + + return UnrollTailLoop<MaxCount - 1>::exec(count - 1, returnIfExited, loopCheck, returnIfFailed, i + 1); + } +}; +template <> template <typename RetType, typename Functor1, typename Functor2> +inline RetType UnrollTailLoop<0>::exec(int, RetType returnIfExited, Functor1, Functor2, int) +{ + return returnIfExited; +} +} +#endif + +// conversion between Latin 1 and UTF-16 +static void qt_from_latin1(ushort *dst, const char *str, size_t size) +{ + /* SIMD: + * Unpacking with SSE has been shown to improve performance on recent CPUs + * The same method gives no improvement with NEON. + */ +#if defined(__SSE2__) + if (size >= 16) { + int chunkCount = size >> 4; // divided by 16 + const __m128i nullMask = _mm_set1_epi32(0); + for (int i = 0; i < chunkCount; ++i) { + const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load + str += 16; + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, firstHalf); // store + dst += 8; + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, secondHalf); // store + dst += 8; + } + size = size % 16; + } +#endif +#if defined(__mips_dsp) + if (size > 20) + qt_fromlatin1_mips_asm_unroll8(dst, str, size); + else + qt_fromlatin1_mips_asm_unroll4(dst, str, size); +#else + while (size--) + *dst++ = (uchar)*str++; +#endif +} + +#if defined(__SSE2__) +static inline __m128i mergeQuestionMarks(__m128i chunk) +{ + const __m128i questionMark = _mm_set1_epi16('?'); + +# ifdef __SSE4_2__ + // compare the unsigned shorts for the range 0x0100-0xFFFF + // note on the use of _mm_cmpestrm: + // The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx) + // says for range search the following: + // For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3 + // + // However, all examples on the Internet, including from Intel + // (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/) + // put the range to be searched first + // + // Disassembly and instruction-level debugging with GCC and ICC show + // that they are doing the right thing. Inverting the arguments in the + // instruction does cause a bunch of test failures. + + const int mode = _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK; + const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100); + const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8, mode); + + // replace the non-Latin 1 characters in the chunk with question marks + chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); +# else + // SSE has no compare instruction for unsigned comparison. + // The variables must be shiffted + 0x8000 to be compared + const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); + const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); + + const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset); + const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); + +# ifdef __SSE4_1__ + // replace the non-Latin 1 characters in the chunk with question marks + chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); +# else + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit + // the 16 bits that were correct contains zeros + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + + // correctBytes contains the bytes that were in limit + // the 16 bits that were off limits contains zeros + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk); + + // merge offLimitQuestionMark and correctBytes to have the result + chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); +# endif +# endif + return chunk; +} +#endif + +static void qt_to_latin1(uchar *dst, const ushort *src, int length) +{ + if (length) { +#if defined(__SSE2__) + if (length >= 16) { + const int chunkCount = length >> 4; // divided by 16 + + for (int i = 0; i < chunkCount; ++i) { + __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load + chunk1 = mergeQuestionMarks(chunk1); + src += 8; + + __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load + chunk2 = mergeQuestionMarks(chunk2); + src += 8; + + // pack the two vector to 16 x 8bits elements + const __m128i result = _mm_packus_epi16(chunk1, chunk2); + + _mm_storeu_si128((__m128i*)dst, result); // store + dst += 16; + } + length = length % 16; + } +#elif defined(__ARM_NEON__) + // Refer to the documentation of the SSE2 implementation + // this use eactly the same method as for SSE except: + // 1) neon has unsigned comparison + // 2) packing is done to 64 bits (8 x 8bits component). + if (length >= 16) { + const int chunkCount = length >> 3; // divided by 8 + const uint16x8_t questionMark = vdupq_n_u16('?'); // set + const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set + for (int i = 0; i < chunkCount; ++i) { + uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load + src += 8; + + const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask + const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark + const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk + chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark + const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing + vst1_u8(dst, result); // store + dst += 8; + } + length = length % 8; + } +#endif +#if defined(__mips_dsp) + qt_toLatin1_mips_dsp_asm(dst, src, length); +#else + while (length--) { + *dst++ = (*src>0xff) ? '?' : (uchar) *src; + ++src; + } +#endif + } +} + // Unicode case-insensitive comparison static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be) { @@ -205,83 +431,39 @@ static int ucstrncmp(const QChar *a, const QChar *b, int l) l); } #endif // __mips_dsp - while (l-- && *a == *b) - a++,b++; - if (l==-1) - return 0; - return a->unicode() - b->unicode(); -} - -// Unicode case-sensitive comparison -static int ucstrcmp(const QChar *a, int alen, const QChar *b, int blen) -{ - if (a == b && alen == blen) +#ifdef __SSE2__ + const char *ptr = reinterpret_cast<const char*>(a); + qptrdiff distance = reinterpret_cast<const char*>(b) - ptr; + a += l & ~7; + b += l & ~7; + l &= 7; + + // we're going to read ptr[0..15] (16 bytes) + for ( ; ptr + 15 < reinterpret_cast<const char *>(a); ptr += 16) { + __m128i a_data = _mm_loadu_si128((__m128i*)ptr); + __m128i b_data = _mm_loadu_si128((__m128i*)(ptr + distance)); + __m128i result = _mm_cmpeq_epi16(a_data, b_data); + uint mask = ~_mm_movemask_epi8(result); + if (ushort(mask)) { + // found a different byte + uint idx = uint(_bit_scan_forward(mask)); + return reinterpret_cast<const QChar *>(ptr + idx)->unicode() + - reinterpret_cast<const QChar *>(ptr + distance + idx)->unicode(); + } + } +# ifdef Q_COMPILER_LAMBDA + const auto &lambda = [=](int i) -> int { + return reinterpret_cast<const QChar *>(ptr)[i].unicode() + - reinterpret_cast<const QChar *>(ptr + distance)[i].unicode(); + }; + return UnrollTailLoop<7>::exec(l, 0, lambda, lambda); +# endif +#endif + if (!l) return 0; - int l = qMin(alen, blen); - int cmp = ucstrncmp(a, b, l); - return cmp ? cmp : (alen-blen); -} - -// Unicode case-insensitive compare two same-sized strings -static int ucstrnicmp(const ushort *a, const ushort *b, int l) -{ - return ucstricmp(a, a + l, b, b + l); -} - -// Benchmarking indicates that doing memcmp is much slower than -// executing the comparison ourselves. -// -// The profiling was done on a population of calls to qMemEquals, generated -// during a run of the demo browser. The profile of the data (32-bit x86 -// Linux) was: -// -// total number of comparisons: 21353 -// longest string compared: 95 -// average comparison length: 14.8786 -// cache-line crosses: 5661 (13.3%) -// alignment histogram: -// 0xXXX0 = 512 (1.2%) strings, 0 (0.0%) of which same-aligned -// 0xXXX2 = 15087 (35.3%) strings, 5145 (34.1%) of which same-aligned -// 0xXXX4 = 525 (1.2%) strings, 0 (0.0%) of which same-aligned -// 0xXXX6 = 557 (1.3%) strings, 6 (1.1%) of which same-aligned -// 0xXXX8 = 509 (1.2%) strings, 0 (0.0%) of which same-aligned -// 0xXXXa = 24358 (57.0%) strings, 9901 (40.6%) of which same-aligned -// 0xXXXc = 557 (1.3%) strings, 0 (0.0%) of which same-aligned -// 0xXXXe = 601 (1.4%) strings, 15 (2.5%) of which same-aligned -// total = 42706 (100%) strings, 15067 (35.3%) of which same-aligned -// -// 92% of the strings have alignment of 2 or 10, which is due to malloc on -// 32-bit Linux returning values aligned to 8 bytes, and offsetof(array, QString::Data) == 18. -// -// The profile on 64-bit will be different since offsetof(array, QString::Data) == 26. -// -// The benchmark results were, for a Core-i7 @ 2.67 GHz 32-bit, compiled with -O3 -funroll-loops: -// 16-bit loads only: 872,301 CPU ticks [Qt 4.5 / memcmp] -// 32- and 16-bit loads: 773,362 CPU ticks [Qt 4.6] -// SSE2 "movdqu" 128-bit loads: 618,736 CPU ticks -// SSE3 "lddqu" 128-bit loads: 619,954 CPU ticks -// SSSE3 "palignr" corrections: 852,147 CPU ticks -// SSE4.2 "pcmpestrm": 738,702 CPU ticks -// -// The same benchmark on an Atom N450 @ 1.66 GHz, is: -// 16-bit loads only: 2,185,882 CPU ticks -// 32- and 16-bit loads: 1,805,060 CPU ticks -// SSE2 "movdqu" 128-bit loads: 2,529,843 CPU ticks -// SSE3 "lddqu" 128-bit loads: 2,514,858 CPU ticks -// SSSE3 "palignr" corrections: 2,160,325 CPU ticks -// SSE4.2 not available -// -// The conclusion we reach is that alignment the SSE2 unaligned code can gain -// 20% improvement in performance in some systems, but suffers a penalty due -// to the unaligned loads on others. - -static bool qMemEquals(const quint16 *a, const quint16 *b, int length) -{ - if (a == b || !length) - return true; union { - const quint16 *w; + const QChar *w; const quint32 *d; quintptr value; } sa, sb; @@ -295,8 +477,8 @@ static bool qMemEquals(const quint16 *a, const quint16 *b, int length) // both addresses are not aligned to 4-bytes boundaries // compare the first character if (*sa.w != *sb.w) - return false; - --length; + return sa.w->unicode() - sb.w->unicode(); + --l; ++sa.w; ++sb.w; @@ -305,23 +487,128 @@ static bool qMemEquals(const quint16 *a, const quint16 *b, int length) // both addresses are 4-bytes aligned // do a fast 32-bit comparison - const quint32 *e = sa.d + (length >> 1); + const quint32 *e = sa.d + (l >> 1); for ( ; sa.d != e; ++sa.d, ++sb.d) { - if (*sa.d != *sb.d) - return false; + if (*sa.d != *sb.d) { + if (*sa.w != *sb.w) + return sa.w->unicode() - sb.w->unicode(); + return sa.w[1].unicode() - sb.w[1].unicode(); + } } // do we have a tail? - return (length & 1) ? *sa.w == *sb.w : true; + return (l & 1) ? sa.w->unicode() - sb.w->unicode() : 0; } else { // one of the addresses isn't 4-byte aligned but the other is - const quint16 *e = sa.w + length; + const QChar *e = sa.w + l; for ( ; sa.w != e; ++sa.w, ++sb.w) { if (*sa.w != *sb.w) - return false; + return sa.w->unicode() - sb.w->unicode(); } } - return true; + return 0; +} + +static int ucstrncmp(const QChar *a, const uchar *c, int l) +{ + const ushort *uc = reinterpret_cast<const ushort *>(a); + const ushort *e = uc + l; + +#ifdef __SSE2__ + __m128i nullmask = _mm_setzero_si128(); + qptrdiff offset = 0; + + // we're going to read uc[offset..offset+15] (32 bytes) + // and c[offset..offset+15] (16 bytes) + for ( ; uc + offset + 15 < e; offset += 16) { + // similar to fromLatin1_helper: + // load Latin 1 data and expand to UTF-16 + __m128i chunk = _mm_loadu_si128((__m128i*)(c + offset)); + __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask); + __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask); + + // load UTF-16 data and compare + __m128i ucdata1 = _mm_loadu_si128((__m128i*)(uc + offset)); + __m128i ucdata2 = _mm_loadu_si128((__m128i*)(uc + offset + 8)); + __m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1); + __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2); + + uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16); + if (mask) { + // found a different character + uint idx = uint(_bit_scan_forward(mask)); + return uc[offset + idx / 2] - c[offset + idx / 2]; + } + } + + // we'll read uc[offset..offset+7] (16 bytes) and c[offset-8..offset+7] (16 bytes) + if (uc + offset + 7 < e) { + // same, but we'll throw away half the data + __m128i chunk = _mm_loadu_si128((__m128i*)(c + offset - 8)); + __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask); + + __m128i ucdata = _mm_loadu_si128((__m128i*)(uc + offset)); + __m128i result = _mm_cmpeq_epi16(secondHalf, ucdata); + uint mask = ~_mm_movemask_epi8(result); + if (ushort(mask)) { + // found a different character + uint idx = uint(_bit_scan_forward(mask)); + return uc[offset + idx / 2] - c[offset + idx / 2]; + } + + // still matched + offset += 8; + } + + // reset uc and c + uc += offset; + c += offset; + +# ifdef Q_COMPILER_LAMBDA + const auto &lambda = [=](int i) { return uc[i] - ushort(c[i]); }; + return UnrollTailLoop<7>::exec(e - uc, 0, lambda, lambda); +# endif +#endif + + while (uc < e) { + int diff = *uc - *c; + if (diff) + return diff; + uc++, c++; + } + + return 0; +} + +// Unicode case-sensitive comparison +static int ucstrcmp(const QChar *a, int alen, const QChar *b, int blen) +{ + if (a == b && alen == blen) + return 0; + int l = qMin(alen, blen); + int cmp = ucstrncmp(a, b, l); + return cmp ? cmp : (alen-blen); +} + +// Unicode case-insensitive compare two same-sized strings +static int ucstrnicmp(const ushort *a, const ushort *b, int l) +{ + return ucstricmp(a, a + l, b, b + l); +} + +static bool qMemEquals(const quint16 *a, const quint16 *b, int length) +{ + if (a == b || !length) + return true; + + return ucstrncmp(reinterpret_cast<const QChar *>(a), reinterpret_cast<const QChar *>(b), length) == 0; +} + +static int ucstrcmp(const QChar *a, int alen, const uchar *b, int blen) +{ + int l = qMin(alen, blen); + int cmp = ucstrncmp(a, b, l); + return cmp ? cmp : (alen-blen); } /*! @@ -340,14 +627,38 @@ static int findChar(const QChar *str, int len, QChar ch, int from, if (from < 0) from = qMax(from + len, 0); if (from < len) { - const ushort *n = s + from - 1; + const ushort *n = s + from; const ushort *e = s + len; if (cs == Qt::CaseSensitive) { +#ifdef __SSE2__ + __m128i mch = _mm_set1_epi32(c | (c << 16)); + + // we're going to read n[0..7] (16 bytes) + for (const ushort *next = n + 8; next <= e; n = next, next += 8) { + __m128i data = _mm_loadu_si128((__m128i*)n); + __m128i result = _mm_cmpeq_epi16(data, mch); + uint mask = _mm_movemask_epi8(result); + if (ushort(mask)) { + // found a match + // same as: return n - s + _bit_scan_forward(mask) / 2 + return (reinterpret_cast<const char *>(n) - reinterpret_cast<const char *>(s) + + _bit_scan_forward(mask)) >> 1; + } + } + +# ifdef Q_COMPILER_LAMBDA + return UnrollTailLoop<7>::exec(e - n, -1, + [=](int i) { return n[i] == c; }, + [=](int i) { return n - s + i; }); +# endif +#endif + --n; while (++n != e) if (*n == c) return n - s; } else { c = foldCase(c); + --n; while (++n != e) if (foldCase(*n) == c) return n - s; @@ -1463,7 +1774,7 @@ QString &QString::operator=(QChar ch) */ QString &QString::insert(int i, QLatin1String str) { - const uchar *s = (const uchar *)str.latin1(); + const char *s = str.latin1(); if (i < 0 || !s || !(*s)) return *this; @@ -1471,8 +1782,7 @@ QString &QString::insert(int i, QLatin1String str) expand(qMax(d->size, i) + len - 1); ::memmove(d->data() + i + len, d->data() + i, (d->size - i - len) * sizeof(QChar)); - for (int j = 0; j < len; ++j) - d->data()[i + j] = s[j]; + qt_from_latin1(d->data() + i, s, uint(len)); return *this; } @@ -1584,14 +1894,14 @@ QString &QString::append(const QChar *str, int len) */ QString &QString::append(QLatin1String str) { - const uchar *s = (const uchar *)str.latin1(); + const char *s = str.latin1(); if (s) { int len = str.size(); if (d->ref.isShared() || uint(d->size + len) + 1u > d->alloc) reallocData(uint(d->size + len) + 1u, true); ushort *i = d->data() + d->size; - while ((*i++ = *s++)) - ; + qt_from_latin1(i, s, uint(len)); + i[len] = '\0'; d->size += len; } return *this; @@ -2098,13 +2408,11 @@ QString& QString::replace(QChar before, QChar after, Qt::CaseSensitivity cs) QString &QString::replace(QLatin1String before, QLatin1String after, Qt::CaseSensitivity cs) { int alen = after.size(); - QVarLengthArray<ushort> a(alen); - for (int i = 0; i < alen; ++i) - a[i] = (uchar)after.latin1()[i]; int blen = before.size(); + QVarLengthArray<ushort> a(alen); QVarLengthArray<ushort> b(blen); - for (int i = 0; i < blen; ++i) - b[i] = (uchar)before.latin1()[i]; + qt_from_latin1(a.data(), after.latin1(), alen); + qt_from_latin1(b.data(), before.latin1(), blen); return replace((const QChar *)b.data(), blen, (const QChar *)a.data(), alen, cs); } @@ -2124,8 +2432,7 @@ QString &QString::replace(QLatin1String before, const QString &after, Qt::CaseSe { int blen = before.size(); QVarLengthArray<ushort> b(blen); - for (int i = 0; i < blen; ++i) - b[i] = (uchar)before.latin1()[i]; + qt_from_latin1(b.data(), before.latin1(), blen); return replace((const QChar *)b.data(), blen, after.constData(), after.d->size, cs); } @@ -2145,8 +2452,7 @@ QString &QString::replace(const QString &before, QLatin1String after, Qt::CaseSe { int alen = after.size(); QVarLengthArray<ushort> a(alen); - for (int i = 0; i < alen; ++i) - a[i] = (uchar)after.latin1()[i]; + qt_from_latin1(a.data(), after.latin1(), alen); return replace(before.constData(), before.d->size, (const QChar *)a.data(), alen, cs); } @@ -2166,8 +2472,7 @@ QString &QString::replace(QChar c, QLatin1String after, Qt::CaseSensitivity cs) { int alen = after.size(); QVarLengthArray<ushort> a(alen); - for (int i = 0; i < alen; ++i) - a[i] = (uchar)after.latin1()[i]; + qt_from_latin1(a.data(), after.latin1(), alen); return replace(&c, 1, (const QChar *)a.data(), alen, cs); } @@ -2201,17 +2506,7 @@ bool QString::operator==(QLatin1String other) const if (!other.size()) return isEmpty(); - const ushort *uc = d->data(); - const ushort *e = uc + d->size; - const uchar *c = (uchar *)other.latin1(); - - while (uc < e) { - if (*uc != *c) - return false; - ++uc; - ++c; - } - return true; + return compare_helper(data(), size(), other, Qt::CaseSensitive) == 0; } /*! \fn bool QString::operator==(const QByteArray &other) const @@ -2265,16 +2560,7 @@ bool QString::operator<(QLatin1String other) const if (!c || *c == 0) return false; - const ushort *uc = d->data(); - const ushort *e = uc + qMin(d->size, other.size()); - - while (uc < e) { - if (*uc != *c) - break; - ++uc; - ++c; - } - return (uc == e ? d->size < other.size() : *uc < *c); + return compare_helper(data(), size(), other, Qt::CaseSensitive) < 0; } /*! \fn bool QString::operator<(const QByteArray &other) const @@ -2367,16 +2653,7 @@ bool QString::operator>(QLatin1String other) const if (!c || *c == '\0') return !isEmpty(); - const ushort *uc = d->data(); - const ushort *e = uc + qMin(d->size, other.size()); - - while (uc < e) { - if (*uc != *c) - break; - ++uc; - ++c; - } - return (uc == e) ? d->size > other.size() : *uc > *c; + return compare_helper(data(), size(), other, Qt::CaseSensitive) > 0; } /*! \fn bool QString::operator>(const QByteArray &other) const @@ -2763,8 +3040,7 @@ int QString::lastIndexOf(QLatin1String str, int from, Qt::CaseSensitivity cs) co from = delta; QVarLengthArray<ushort> s(sl); - for (int i = 0; i < sl; ++i) - s[i] = str.latin1()[i]; + qt_from_latin1(s.data(), str.latin1(), sl); return lastIndexOfHelper(d->data(), from, s.data(), sl, cs); } @@ -3172,6 +3448,15 @@ int QString::count(const QStringRef &str, Qt::CaseSensitivity cs) const \sa indexOf(), count() */ +/*! \fn bool QString::contains(QLatin1String str, Qt::CaseSensitivity cs = Qt::CaseSensitive) const + \since 5.3 + + \overload contains() + + Returns \c true if this string contains an occurrence of the latin-1 string + \a str; otherwise returns \c false. +*/ + /*! \fn bool QString::contains(QChar ch, Qt::CaseSensitivity cs = Qt::CaseSensitive) const \overload contains() @@ -3895,131 +4180,58 @@ bool QString::endsWith(QChar c, Qt::CaseSensitivity cs) const : foldCase(d->data()[d->size - 1]) == foldCase(c.unicode())); } - -#if defined(__SSE2__) -static inline __m128i mergeQuestionMarks(__m128i chunk) +QByteArray QString::toLatin1_helper(const QString &string) { - const __m128i questionMark = _mm_set1_epi16('?'); - -# ifdef __SSE4_2__ - // compare the unsigned shorts for the range 0x0100-0xFFFF - // note on the use of _mm_cmpestrm: - // The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx) - // says for range search the following: - // For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3 - // - // However, all examples on the Internet, including from Intel - // (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/) - // put the range to be searched first - // - // Disassembly and instruction-level debugging with GCC and ICC show - // that they are doing the right thing. Inverting the arguments in the - // instruction does cause a bunch of test failures. - - const int mode = _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK; - const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100); - const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8, mode); - - // replace the non-Latin 1 characters in the chunk with question marks - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else - // SSE has no compare instruction for unsigned comparison. - // The variables must be shiffted + 0x8000 to be compared - const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); - const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); - - const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset); - const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); + if (Q_UNLIKELY(string.isNull())) + return QByteArray(); -# ifdef __SSE4_1__ - // replace the non-Latin 1 characters in the chunk with question marks - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else - // offLimitQuestionMark contains '?' for each 16 bits that was off-limit - // the 16 bits that were correct contains zeros - const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + return toLatin1_helper(string.constData(), string.length()); +} - // correctBytes contains the bytes that were in limit - // the 16 bits that were off limits contains zeros - const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk); +QByteArray QString::toLatin1_helper(const QChar *data, int length) +{ + QByteArray ba(length, Qt::Uninitialized); - // merge offLimitQuestionMark and correctBytes to have the result - chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); -# endif -# endif - return chunk; + // since we own the only copy, we're going to const_cast the constData; + // that avoids an unnecessary call to detach() and expansion code that will never get used + qt_to_latin1(reinterpret_cast<uchar *>(const_cast<char *>(ba.constData())), + reinterpret_cast<const ushort *>(data), length); + return ba; } -#endif - -#if defined(__mips_dsp) -extern "C" void qt_toLatin1_mips_dsp_asm(uchar *dst, const ushort *src, int length); -#endif -static QByteArray toLatin1_helper(const QChar *data, int length) +QByteArray QString::toLatin1_helper_inplace(QString &s) { - QByteArray ba; - if (length) { - ba.resize(length); - const ushort *src = reinterpret_cast<const ushort *>(data); - uchar *dst = (uchar*) ba.data(); -#if defined(__SSE2__) - if (length >= 16) { - const int chunkCount = length >> 4; // divided by 16 + if (!s.isDetached()) + return s.toLatin1(); - for (int i = 0; i < chunkCount; ++i) { - __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load - chunk1 = mergeQuestionMarks(chunk1); - src += 8; + // We can return our own buffer to the caller. + // Conversion to Latin-1 always shrinks the buffer by half. + const ushort *data = reinterpret_cast<const ushort *>(s.constData()); + uint length = s.size(); - __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load - chunk2 = mergeQuestionMarks(chunk2); - src += 8; + // Swap the d pointers. + // Kids, avert your eyes. Don't try this at home. + QArrayData *ba_d = s.d; - // pack the two vector to 16 x 8bits elements - const __m128i result = _mm_packus_epi16(chunk1, chunk2); + // multiply the allocated capacity by sizeof(ushort) + ba_d->alloc *= sizeof(ushort); - _mm_storeu_si128((__m128i*)dst, result); // store - dst += 16; - } - length = length % 16; - } -#elif defined(__ARM_NEON__) - // Refer to the documentation of the SSE2 implementation - // this use eactly the same method as for SSE except: - // 1) neon has unsigned comparison - // 2) packing is done to 64 bits (8 x 8bits component). - if (length >= 16) { - const int chunkCount = length >> 3; // divided by 8 - const uint16x8_t questionMark = vdupq_n_u16('?'); // set - const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set - for (int i = 0; i < chunkCount; ++i) { - uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load - src += 8; + // reset ourselves to QString() + s.d = QString().d; - const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask - const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark - const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk - chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark - const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing - vst1_u8(dst, result); // store - dst += 8; - } - length = length % 8; - } -#endif -#if defined(__mips_dsp) - qt_toLatin1_mips_dsp_asm(dst, src, length); -#else - while (length--) { - *dst++ = (*src>0xff) ? '?' : (uchar) *src; - ++src; - } -#endif - } - return ba; + // do the in-place conversion + uchar *dst = reinterpret_cast<uchar *>(ba_d->data()); + qt_to_latin1(dst, data, length); + dst[length] = '\0'; + + QByteArrayDataPtr badptr = { ba_d }; + return QByteArray(badptr); } + /*! + \fn QByteArray QString::toLatin1() const + Returns a Latin-1 representation of the string as a QByteArray. The returned byte array is undefined if the string contains non-Latin1 @@ -4028,10 +4240,6 @@ static QByteArray toLatin1_helper(const QChar *data, int length) \sa fromLatin1(), toUtf8(), toLocal8Bit(), QTextCodec */ -QByteArray QString::toLatin1() const -{ - return toLatin1_helper(unicode(), length()); -} /*! \fn QByteArray QString::toAscii() const @@ -4046,19 +4254,9 @@ QByteArray QString::toLatin1() const \sa fromAscii(), toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec */ -#if !defined(Q_OS_MAC) && defined(Q_OS_UNIX) && !defined(QT_USE_ICU) -static QByteArray toLocal8Bit_helper(const QChar *data, int length) -{ -#ifndef QT_NO_TEXTCODEC - QTextCodec *localeCodec = QTextCodec::codecForLocale(); - if (localeCodec) - return localeCodec->fromUnicode(data, length); -#endif // QT_NO_TEXTCODEC - return toLatin1_helper(data, length); -} -#endif - /*! + \fn QByteArray QString::toLocal8Bit() const + Returns the local 8-bit representation of the string as a QByteArray. The returned byte array is undefined if the string contains characters not supported by the local 8-bit encoding. @@ -4073,17 +4271,21 @@ static QByteArray toLocal8Bit_helper(const QChar *data, int length) \sa fromLocal8Bit(), toLatin1(), toUtf8(), QTextCodec */ -QByteArray QString::toLocal8Bit() const + +QByteArray QString::toLocal8Bit_helper(const QChar *data, int size) { #ifndef QT_NO_TEXTCODEC QTextCodec *localeCodec = QTextCodec::codecForLocale(); if (localeCodec) - return localeCodec->fromUnicode(*this); + return localeCodec->fromUnicode(data, size); #endif // QT_NO_TEXTCODEC - return toLatin1(); + return toLatin1_helper(data, size); } + /*! + \fn QByteArray QString::toUtf8() const + Returns a UTF-8 representation of the string as a QByteArray. UTF-8 is a Unicode codec and can represent all characters in a Unicode @@ -4099,12 +4301,13 @@ QByteArray QString::toLocal8Bit() const \sa fromUtf8(), toLatin1(), toLocal8Bit(), QTextCodec */ -QByteArray QString::toUtf8() const + +QByteArray QString::toUtf8_helper(const QString &str) { - if (isNull()) + if (str.isNull()) return QByteArray(); - return QUtf8::convertFromUnicode(constData(), length(), 0); + return QUtf8::convertFromUnicode(str.constData(), str.length()); } /*! @@ -4126,12 +4329,6 @@ QVector<uint> QString::toUcs4() const return v; } -#if defined(__mips_dsp) -// From qstring_mips_dsp_asm.S -extern "C" void qt_fromlatin1_mips_asm_unroll4 (ushort*, const char*, uint); -extern "C" void qt_fromlatin1_mips_asm_unroll8 (ushort*, const char*, uint); -#endif - QString::Data *QString::fromLatin1_helper(const char *str, int size) { Data *d; @@ -4147,40 +4344,8 @@ QString::Data *QString::fromLatin1_helper(const char *str, int size) d->size = size; d->data()[size] = '\0'; ushort *dst = d->data(); - /* SIMD: - * Unpacking with SSE has been shown to improve performance on recent CPUs - * The same method gives no improvement with NEON. - */ -#if defined(__SSE2__) - if (size >= 16) { - int chunkCount = size >> 4; // divided by 16 - const __m128i nullMask = _mm_set1_epi32(0); - for (int i = 0; i < chunkCount; ++i) { - const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load - str += 16; - // unpack the first 8 bytes, padding with zeros - const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); - _mm_storeu_si128((__m128i*)dst, firstHalf); // store - dst += 8; - - // unpack the last 8 bytes, padding with zeros - const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); - _mm_storeu_si128((__m128i*)dst, secondHalf); // store - dst += 8; - } - size = size % 16; - } -#endif -#if defined(__mips_dsp) - if (size > 20) - qt_fromlatin1_mips_asm_unroll8(dst, str, size); - else - qt_fromlatin1_mips_asm_unroll4(dst, str, size); -#else - while (size--) - *dst++ = (uchar)*str++; -#endif + qt_from_latin1(dst, str, uint(size)); } return d; } @@ -4305,7 +4470,7 @@ QString QString::fromUtf8_helper(const char *str, int size) return QString(); Q_ASSERT(size != -1); - return QUtf8::convertToUnicode(str, size, 0); + return QUtf8::convertToUnicode(str, size); } /*! @@ -5039,22 +5204,7 @@ int QString::compare_helper(const QChar *data1, int length1, QLatin1String s2, return length1; if (cs == Qt::CaseSensitive) { - const ushort *e = uc + length1; - if (s2.size() < length1) - e = uc + s2.size(); - while (uc < e) { - int diff = *uc - *c; - if (diff) - return diff; - uc++, c++; - } - - if (uc == uce) { - if (c == (const uchar *)s2.latin1() + s2.size()) - return 0; - return -1; - } - return 1; + return ucstrcmp(data1, length1, c, s2.size()); } else { return ucstricmp(uc, uce, c, c + s2.size()); } @@ -5144,7 +5294,11 @@ int QString::localeAwareCompare_helper(const QChar *data1, int length1, return ucstrcmp(data1, length1, data2, length2); #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE) +#ifndef Q_OS_WINRT int res = CompareString(GetUserDefaultLCID(), 0, (wchar_t*)data1, length1, (wchar_t*)data2, length2); +#else + int res = CompareStringEx(LOCALE_NAME_USER_DEFAULT, 0, (LPCWSTR)data1, length1, (LPCWSTR)data2, length2, NULL, NULL, 0); +#endif switch (res) { case CSTR_LESS_THAN: @@ -8254,19 +8408,10 @@ bool operator==(QLatin1String s1, const QStringRef &s2) if (s1.size() != s2.size()) return false; - const ushort *uc = reinterpret_cast<const ushort *>(s2.unicode()); - const ushort *e = uc + s2.size(); const uchar *c = reinterpret_cast<const uchar *>(s1.latin1()); if (!c) return s2.isEmpty(); - - while (*c) { - if (uc == e || *uc != *c) - return false; - ++uc; - ++c; - } - return (uc == e); + return ucstrncmp(s2.unicode(), c, s2.size()) == 0; } /*! @@ -8854,8 +8999,7 @@ int QStringRef::lastIndexOf(QLatin1String str, int from, Qt::CaseSensitivity cs) from = delta; QVarLengthArray<ushort> s(sl); - for (int i = 0; i < sl; ++i) - s[i] = str.latin1()[i]; + qt_from_latin1(s.data(), str.latin1(), sl); return lastIndexOfHelper(reinterpret_cast<const ushort*>(unicode()), from, s.data(), sl, cs); } @@ -9193,8 +9337,7 @@ static inline int qt_find_latin1_string(const QChar *haystack, int size, const char *latin1 = needle.latin1(); int len = needle.size(); QVarLengthArray<ushort> s(len); - for (int i = 0; i < len; ++i) - s[i] = latin1[i]; + qt_from_latin1(s.data(), latin1, len); return qFindString(haystack, size, from, reinterpret_cast<const QChar*>(s.constData()), len, cs); @@ -9238,9 +9381,7 @@ static inline bool qt_starts_with(const QChar *haystack, int haystackLen, const ushort *data = reinterpret_cast<const ushort*>(haystack); const uchar *latin = reinterpret_cast<const uchar*>(needle.latin1()); if (cs == Qt::CaseSensitive) { - for (int i = 0; i < slen; ++i) - if (data[i] != latin[i]) - return false; + return ucstrncmp(haystack, latin, slen) == 0; } else { for (int i = 0; i < slen; ++i) if (foldCase(data[i]) != foldCase((ushort)latin[i])) @@ -9290,9 +9431,7 @@ static inline bool qt_ends_with(const QChar *haystack, int haystackLen, const uchar *latin = reinterpret_cast<const uchar*>(needle.latin1()); const ushort *data = reinterpret_cast<const ushort*>(haystack); if (cs == Qt::CaseSensitive) { - for (int i = 0; i < slen; i++) - if (data[pos+i] != latin[i]) - return false; + return ucstrncmp(haystack + pos, latin, slen) == 0; } else { for (int i = 0; i < slen; i++) if (foldCase(data[pos+i]) != foldCase((ushort)latin[i])) @@ -9314,7 +9453,7 @@ static inline bool qt_ends_with(const QChar *haystack, int haystackLen, */ QByteArray QStringRef::toLatin1() const { - return toLatin1_helper(unicode(), length()); + return QString::toLatin1_helper(unicode(), length()); } /*! |