author     Thiago Macieira <thiago.macieira@intel.com>   2021-12-22 00:26:15 -0300
committer  Thiago Macieira <thiago.macieira@intel.com>   2022-12-04 17:56:45 -0800
commit     339aa99fecce54aef8738d915185dca5ec193447 (patch)
tree       42b65ca06bac4f494340a110b404ed079b6372cf /src/corelib
parent     3ef43ca837b36cd0aefc925ea08234395dcf49e2 (diff)
QString::toLatin1: do the same as fromLatin1 for sub-16 characters
Perform a pair of overlapped loads & stores, so we can perform the
contracting and inserting of question marks in vector code.

Change-Id: Ib42b3adc93bf4d43bd55fffd16c2f5e479b0f6fa
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
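Note: the overlapped-chunk idea, sketched outside the Qt sources. For a tail of
8..15 UTF-16 code units, load the first eight and the last eight (the two loads
overlap), squash every code unit above 0xff to '?', pack both chunks down to
8-bit, then store eight bytes at the front and eight bytes ending at dst + length;
the overlapping bytes are simply written twice with identical, already-converted
values. The self-contained illustration below is not the Qt code: the function
name to_latin1_8_to_15 is made up for the sketch, it hard-codes the SSE2 path,
and it omits the Checked template parameter and the SSE4.2 variant of
mergeQuestionMarks.

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdio>
#include <cstddef>

// Convert 8..15 UTF-16 code units to Latin-1, replacing anything above
// U+00FF with '?', using two overlapping 8-unit chunks.
static void to_latin1_8_to_15(unsigned char *dst, const char16_t *src, size_t len)
{
    const __m128i questionMark = _mm_set1_epi16('?');
    // SSE2 has no unsigned 16-bit compare, so bias both operands by 0x8000
    // and use the signed compare instead (the same trick Qt's SSE2 fallback uses).
    const __m128i signBit   = _mm_set1_epi16(short(0x8000));
    const __m128i threshold = _mm_set1_epi16(short(0x00ff + 0x8000));
    auto merge = [&](__m128i chunk) {
        __m128i biased   = _mm_add_epi16(chunk, signBit);
        __m128i offLimit = _mm_cmpgt_epi16(biased, threshold);     // 0xffff where char > 0xff
        return _mm_or_si128(_mm_andnot_si128(offLimit, chunk),      // keep in-range chars
                            _mm_and_si128(offLimit, questionMark)); // '?' elsewhere
    };

    // Both loads happen before any store, so in-place conversion works too.
    __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
    __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + len - 8));
    chunk1 = merge(chunk1);
    chunk2 = merge(chunk2);

    // Pack 16-bit to 8-bit with unsigned saturation; the upper half is ignored.
    __m128i result1 = _mm_packus_epi16(chunk1, chunk1);
    __m128i result2 = _mm_packus_epi16(chunk2, chunk2);
    _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), result1);
    _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + len - 8), result2);
}

int main()
{
    const char16_t in[] = u"overlap\u4e2dstore";    // 13 code units, one above U+00FF
    unsigned char out[16] = {};
    to_latin1_8_to_15(out, in, 13);
    std::puts(reinterpret_cast<const char *>(out)); // prints "overlap?store"
}

The same trick covers 4..7 code units with _mm_loadl_epi64 loads and 4-byte
stores, as the second branch of the patch does. The point of the overlapping
head/tail chunks is that every length in the range is handled by exactly two
vector loads and two vector stores, with no scalar tail loop.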
Diffstat (limited to 'src/corelib')
-rw-r--r--   src/corelib/text/qstring.cpp   59
1 file changed, 29 insertions(+), 30 deletions(-)
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index bce00325b9..ce288b030d 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -955,10 +955,9 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
return _mm_packus_epi16(chunk1, chunk2);
};
- uchar *e = dst + length;
- qptrdiff offset = 0;
if (size_t(length) >= sizeof(__m128i)) {
// because of possible overlapping, we won't process the last chunk in the loop
+ qptrdiff offset = 0;
for ( ; offset + 2 * sizeof(__m128i) < size_t(length); offset += sizeof(__m128i))
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), loadChunkAt(offset));
@@ -971,45 +970,45 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
}
# if !defined(__OPTIMIZE_SIZE__)
- // we're going to write to dst[offset..offset+7] (8 bytes)
- if (dst + offset + 7 < e) {
- __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + offset));
- chunk = mergeQuestionMarks(chunk);
-
- // pack, where the upper half is ignored
- const __m128i result = _mm_packus_epi16(chunk, chunk);
- _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + offset), result);
- offset += 8;
- }
+ if (length >= 4) {
+ // this code is fine even for in-place conversion because we load both
+ // before any store
+ if (length >= 8) {
+ __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+ __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + length - 8));
+ chunk1 = mergeQuestionMarks(chunk1);
+ chunk2 = mergeQuestionMarks(chunk2);
- // we're going to write to dst[offset..offset+3] (4 bytes)
- if (dst + offset + 3 < e) {
- __m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + offset));
- chunk = mergeQuestionMarks(chunk);
+ // pack, where the upper half is ignored
+ const __m128i result1 = _mm_packus_epi16(chunk1, chunk1);
+ const __m128i result2 = _mm_packus_epi16(chunk2, chunk2);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), result1);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + length - 8), result2);
+ } else {
+ __m128i chunk1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
+ __m128i chunk2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + length - 4));
+ chunk1 = mergeQuestionMarks(chunk1);
+ chunk2 = mergeQuestionMarks(chunk2);
- // pack, we'll zero the upper three quarters
- const __m128i result = _mm_packus_epi16(chunk, chunk);
- qToUnaligned(_mm_cvtsi128_si32(result), dst + offset);
- offset += 4;
+ // pack, we'll zero the upper three quarters
+ const __m128i result1 = _mm_packus_epi16(chunk1, chunk1);
+ const __m128i result2 = _mm_packus_epi16(chunk2, chunk2);
+ qToUnaligned(_mm_cvtsi128_si32(result1), dst);
+ qToUnaligned(_mm_cvtsi128_si32(result2), dst + length - 4);
+ }
+ return;
}
length = length % 4;
-# else
- length = length % 16;
-# endif // optimize size
-
- // advance dst, src for tail processing
- dst += offset;
- src += offset;
-
-# if !defined(__OPTIMIZE_SIZE__)
return UnrollTailLoop<3>::exec(length, [=](qsizetype i) {
if (Checked)
dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i];
else
dst[i] = src[i];
});
-# endif
+# else
+ length = length % 16;
+# endif // optimize size
#elif defined(__ARM_NEON__)
// Refer to the documentation of the SSE2 implementation.
// This uses exactly the same method as for SSE except: