diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2021-12-22 00:26:15 -0300 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2022-12-04 17:56:45 -0800 |
commit | 339aa99fecce54aef8738d915185dca5ec193447 (patch) | |
tree | 42b65ca06bac4f494340a110b404ed079b6372cf /src/corelib | |
parent | 3ef43ca837b36cd0aefc925ea08234395dcf49e2 (diff) |
QString::toLatin1: do the same as fromLatin1 for sub-16 characters
Perform a pair of overlapped loads & stores, so we can perform the
contracting and inserting of question marks in vector code.
Change-Id: Ib42b3adc93bf4d43bd55fffd16c2f5e479b0f6fa
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
Diffstat (limited to 'src/corelib')
-rw-r--r-- | src/corelib/text/qstring.cpp | 59 |
1 files changed, 29 insertions, 30 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index bce00325b9..ce288b030d 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -955,10 +955,9 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len return _mm_packus_epi16(chunk1, chunk2); }; - uchar *e = dst + length; - qptrdiff offset = 0; if (size_t(length) >= sizeof(__m128i)) { // because of possible overlapping, we won't process the last chunk in the loop + qptrdiff offset = 0; for ( ; offset + 2 * sizeof(__m128i) < size_t(length); offset += sizeof(__m128i)) _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), loadChunkAt(offset)); @@ -971,45 +970,45 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len } # if !defined(__OPTIMIZE_SIZE__) - // we're going to write to dst[offset..offset+7] (8 bytes) - if (dst + offset + 7 < e) { - __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + offset)); - chunk = mergeQuestionMarks(chunk); - - // pack, where the upper half is ignored - const __m128i result = _mm_packus_epi16(chunk, chunk); - _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + offset), result); - offset += 8; - } + if (length >= 4) { + // this code is fine even for in-place conversion because we load both + // before any store + if (length >= 8) { + __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)); + __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + length - 8)); + chunk1 = mergeQuestionMarks(chunk1); + chunk2 = mergeQuestionMarks(chunk2); - // we're going to write to dst[offset..offset+3] (4 bytes) - if (dst + offset + 3 < e) { - __m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + offset)); - chunk = mergeQuestionMarks(chunk); + // pack, where the upper half is ignored + const __m128i result1 = _mm_packus_epi16(chunk1, chunk1); + const __m128i result2 = _mm_packus_epi16(chunk2, chunk2); + _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), result1); + _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + length - 8), result2); + } else { + __m128i chunk1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src)); + __m128i chunk2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + length - 4)); + chunk1 = mergeQuestionMarks(chunk1); + chunk2 = mergeQuestionMarks(chunk2); - // pack, we'll the upper three quarters - const __m128i result = _mm_packus_epi16(chunk, chunk); - qToUnaligned(_mm_cvtsi128_si32(result), dst + offset); - offset += 4; + // pack, we'll zero the upper three quarters + const __m128i result1 = _mm_packus_epi16(chunk1, chunk1); + const __m128i result2 = _mm_packus_epi16(chunk2, chunk2); + qToUnaligned(_mm_cvtsi128_si32(result1), dst); + qToUnaligned(_mm_cvtsi128_si32(result2), dst + length - 4); + } + return; } length = length % 4; -# else - length = length % 16; -# endif // optimize size - - // advance dst, src for tail processing - dst += offset; - src += offset; - -# if !defined(__OPTIMIZE_SIZE__) return UnrollTailLoop<3>::exec(length, [=](qsizetype i) { if (Checked) dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; else dst[i] = src[i]; }); -# endif +# else + length = length % 16; +# endif // optimize size #elif defined(__ARM_NEON__) // Refer to the documentation of the SSE2 implementation. // This uses exactly the same method as for SSE except: |