QString::{to,from}Latin1: add the ability to do overlapping tails

If the string length is larger than the number of characters we can operate on with a single vector loop, we can transform the tail using a vector too, just overlapping up to 15 characters with the last iteration of the loop. Change-Id: Ib42b3adc93bf4d43bd55fffd16c2dcab115e50f7 Reviewed-by: Lars Knoll <lars@knoll.priv.no>
author: Thiago Macieira <thiago.macieira@intel.com> 2021-12-21 16:44:01 -0300
committer: Thiago Macieira <thiago.macieira@intel.com> 2022-12-04 17:56:45 -0800
commit: 2b9d4afc95a6e716f7bb1839df4041e454aa52af (patch)
tree: de3947dfc8175b8a3851107d03e27230846eaff5 /src/corelib
parent: 3b528670e6d5cbe25e0892b484b3e93417e263d3 (diff)
1 files changed, 31 insertions, 13 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 90e791d0d3..f41e9377eb 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -803,11 +803,8 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
      * itself in exactly the same way as one would do it with intrinsics.
      */
 #if defined(__SSE2__)
-    const char *e = str + size;
-    qptrdiff offset = 0;
-
     // we're going to read str[offset..offset+15] (16 bytes)
-    for ( ; str + offset + 15 < e; offset += 16) {
+    auto processOneChunk = [=](qptrdiff offset) {
         const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load
         if constexpr (UseAvx2) {
             // zero extend to an YMM register
@@ -826,10 +823,21 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
             const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
             _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
         }
+    };
+
+    const char *e = str + size;
+    qptrdiff offset = 0;
+    if (size >= sizeof(__m128i)) {
+        for ( ; str + offset + sizeof(__m128i) <= e; offset += sizeof(__m128i))
+            processOneChunk(offset);
+        if (str + offset < e)
+            processOneChunk(size - sizeof(__m128i));
+        return;
     }
 
+#  if !defined(__OPTIMIZE_SIZE__)
     // we're going to read str[offset..offset+7] (8 bytes)
-    if (str + offset + 7 < e) {
+    if (str + offset + 8 <= e) {
         const __m128i unpacked = mm_load8_zero_extend(str + offset);
         _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
         offset += 8;
@@ -838,7 +846,6 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
     size = size % 8;
     dst += offset;
     str += offset;
-#  if !defined(__OPTIMIZE_SIZE__)
     return UnrollTailLoop<7>::exec(qsizetype(size), [=](qsizetype i) { dst[i] = (uchar)str[i]; });
 #  endif
 #endif
@@ -859,9 +866,6 @@ template <bool Checked>
 static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype length)
 {
 #if defined(__SSE2__)
-    uchar *e = dst + length;
-    qptrdiff offset = 0;
-
     auto questionMark256 = []() {
         if constexpr (UseAvx2)
             return _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?'));
@@ -917,8 +921,8 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
         return chunk;
     };
 
-    // we're going to write to dst[offset..offset+15] (16 bytes)
-    for ( ; dst + offset + 15 < e; offset += 16) {
+    // we're going to read to src[offset..offset+15] (16 bytes)
+    auto loadChunkAt = [=](qptrdiff offset) {
         __m128i chunk1, chunk2;
         if constexpr (UseAvx2) {
             __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset));
@@ -940,8 +944,22 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
         }
 
         // pack the two vector to 16 x 8bits elements
-        const __m128i result = _mm_packus_epi16(chunk1, chunk2);
-        _mm_storeu_si128((__m128i*)(dst + offset), result); // store
+        return _mm_packus_epi16(chunk1, chunk2);
+    };
+
+    uchar *e = dst + length;
+    qptrdiff offset = 0;
+    if (size_t(length) >= sizeof(__m128i)) {
+        // because of possible overlapping, we won't process the last chunk in the loop
+        for ( ; offset + 2 * sizeof(__m128i) < size_t(length); offset += sizeof(__m128i))
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), loadChunkAt(offset));
+
+        // overlapped conversion of the last full chunk and the tail
+        __m128i last1 = loadChunkAt(offset);
+        __m128i last2 = loadChunkAt(length - sizeof(__m128i));
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), last1);
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + length - sizeof(__m128i)), last2);
+        return;
     }
 
 #  if !defined(__OPTIMIZE_SIZE__)
author	Thiago Macieira <thiago.macieira@intel.com>	2021-12-21 16:44:01 -0300
committer	Thiago Macieira <thiago.macieira@intel.com>	2022-12-04 17:56:45 -0800
commit	2b9d4afc95a6e716f7bb1839df4041e454aa52af (patch)
tree	de3947dfc8175b8a3851107d03e27230846eaff5 /src/corelib
parent	3b528670e6d5cbe25e0892b484b3e93417e263d3 (diff)