diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2021-12-21 16:44:01 -0300 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2022-12-04 17:56:45 -0800 |
commit | 2b9d4afc95a6e716f7bb1839df4041e454aa52af (patch) | |
tree | de3947dfc8175b8a3851107d03e27230846eaff5 /src/corelib | |
parent | 3b528670e6d5cbe25e0892b484b3e93417e263d3 (diff) |
QString::{to,from}Latin1: add the ability to do overlapping tails
If the string length is larger than the number of characters we can
operate on with a single vector loop, we can transform the tail using a
vector too, just overlapping up to 15 characters with the last iteration
of the loop.
Change-Id: Ib42b3adc93bf4d43bd55fffd16c2dcab115e50f7
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
Diffstat (limited to 'src/corelib')
-rw-r--r-- | src/corelib/text/qstring.cpp | 44 |
1 files changed, 31 insertions, 13 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 90e791d0d3..f41e9377eb 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -803,11 +803,8 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n * itself in exactly the same way as one would do it with intrinsics. */ #if defined(__SSE2__) - const char *e = str + size; - qptrdiff offset = 0; - // we're going to read str[offset..offset+15] (16 bytes) - for ( ; str + offset + 15 < e; offset += 16) { + auto processOneChunk = [=](qptrdiff offset) { const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load if constexpr (UseAvx2) { // zero extend to an YMM register @@ -826,10 +823,21 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store } + }; + + const char *e = str + size; + qptrdiff offset = 0; + if (size >= sizeof(__m128i)) { + for ( ; str + offset + sizeof(__m128i) <= e; offset += sizeof(__m128i)) + processOneChunk(offset); + if (str + offset < e) + processOneChunk(size - sizeof(__m128i)); + return; } +# if !defined(__OPTIMIZE_SIZE__) // we're going to read str[offset..offset+7] (8 bytes) - if (str + offset + 7 < e) { + if (str + offset + 8 <= e) { const __m128i unpacked = mm_load8_zero_extend(str + offset); _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked); offset += 8; @@ -838,7 +846,6 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n size = size % 8; dst += offset; str += offset; -# if !defined(__OPTIMIZE_SIZE__) return UnrollTailLoop<7>::exec(qsizetype(size), [=](qsizetype i) { dst[i] = (uchar)str[i]; }); # endif #endif @@ -859,9 +866,6 @@ template <bool Checked> static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype length) { #if defined(__SSE2__) - uchar *e = dst + length; 
- qptrdiff offset = 0; - auto questionMark256 = []() { if constexpr (UseAvx2) return _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?')); @@ -917,8 +921,8 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len return chunk; }; - // we're going to write to dst[offset..offset+15] (16 bytes) - for ( ; dst + offset + 15 < e; offset += 16) { + // we're going to read to src[offset..offset+15] (16 bytes) + auto loadChunkAt = [=](qptrdiff offset) { __m128i chunk1, chunk2; if constexpr (UseAvx2) { __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset)); @@ -940,8 +944,22 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len } // pack the two vector to 16 x 8bits elements - const __m128i result = _mm_packus_epi16(chunk1, chunk2); - _mm_storeu_si128((__m128i*)(dst + offset), result); // store + return _mm_packus_epi16(chunk1, chunk2); + }; + + uchar *e = dst + length; + qptrdiff offset = 0; + if (size_t(length) >= sizeof(__m128i)) { + // because of possible overlapping, we won't process the last chunk in the loop + for ( ; offset + 2 * sizeof(__m128i) < size_t(length); offset += sizeof(__m128i)) + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), loadChunkAt(offset)); + + // overlapped conversion of the last full chunk and the tail + __m128i last1 = loadChunkAt(offset); + __m128i last2 = loadChunkAt(length - sizeof(__m128i)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), last1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + length - sizeof(__m128i)), last2); + return; } # if !defined(__OPTIMIZE_SIZE__) |