summaryrefslogtreecommitdiffstats
path: root/src/corelib
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2021-12-21 16:44:01 -0300
committerThiago Macieira <thiago.macieira@intel.com>2022-12-04 17:56:45 -0800
commit2b9d4afc95a6e716f7bb1839df4041e454aa52af (patch)
treede3947dfc8175b8a3851107d03e27230846eaff5 /src/corelib
parent3b528670e6d5cbe25e0892b484b3e93417e263d3 (diff)
QString::{to,from}Latin1: add the ability to do overlapping tails
If the string length is larger than the number of characters we can operate on with a single vector loop, we can transform the tail using a vector too, just overlapping up to 15 characters with the last iteration of the loop. Change-Id: Ib42b3adc93bf4d43bd55fffd16c2dcab115e50f7 Reviewed-by: Lars Knoll <lars@knoll.priv.no>
Diffstat (limited to 'src/corelib')
-rw-r--r--src/corelib/text/qstring.cpp44
1 files changed, 31 insertions, 13 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 90e791d0d3..f41e9377eb 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -803,11 +803,8 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
* itself in exactly the same way as one would do it with intrinsics.
*/
#if defined(__SSE2__)
- const char *e = str + size;
- qptrdiff offset = 0;
-
// we're going to read str[offset..offset+15] (16 bytes)
- for ( ; str + offset + 15 < e; offset += 16) {
+ auto processOneChunk = [=](qptrdiff offset) {
const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load
if constexpr (UseAvx2) {
// zero extend to an YMM register
@@ -826,10 +823,21 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
}
+ };
+
+ const char *e = str + size;
+ qptrdiff offset = 0;
+ if (size >= sizeof(__m128i)) {
+ for ( ; str + offset + sizeof(__m128i) <= e; offset += sizeof(__m128i))
+ processOneChunk(offset);
+ if (str + offset < e)
+ processOneChunk(size - sizeof(__m128i));
+ return;
}
+# if !defined(__OPTIMIZE_SIZE__)
// we're going to read str[offset..offset+7] (8 bytes)
- if (str + offset + 7 < e) {
+ if (str + offset + 8 <= e) {
const __m128i unpacked = mm_load8_zero_extend(str + offset);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
offset += 8;
@@ -838,7 +846,6 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
size = size % 8;
dst += offset;
str += offset;
-# if !defined(__OPTIMIZE_SIZE__)
return UnrollTailLoop<7>::exec(qsizetype(size), [=](qsizetype i) { dst[i] = (uchar)str[i]; });
# endif
#endif
@@ -859,9 +866,6 @@ template <bool Checked>
static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype length)
{
#if defined(__SSE2__)
- uchar *e = dst + length;
- qptrdiff offset = 0;
-
auto questionMark256 = []() {
if constexpr (UseAvx2)
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?'));
@@ -917,8 +921,8 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
return chunk;
};
- // we're going to write to dst[offset..offset+15] (16 bytes)
- for ( ; dst + offset + 15 < e; offset += 16) {
+ // we're going to read from src[offset..offset+15] (16 characters, 32 bytes)
+ auto loadChunkAt = [=](qptrdiff offset) {
__m128i chunk1, chunk2;
if constexpr (UseAvx2) {
__m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset));
@@ -940,8 +944,22 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
}
// pack the two vector to 16 x 8bits elements
- const __m128i result = _mm_packus_epi16(chunk1, chunk2);
- _mm_storeu_si128((__m128i*)(dst + offset), result); // store
+ return _mm_packus_epi16(chunk1, chunk2);
+ };
+
+ uchar *e = dst + length;
+ qptrdiff offset = 0;
+ if (size_t(length) >= sizeof(__m128i)) {
+ // because of possible overlapping, we won't process the last chunk in the loop
+ for ( ; offset + 2 * sizeof(__m128i) < size_t(length); offset += sizeof(__m128i))
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), loadChunkAt(offset));
+
+ // overlapped conversion of the last full chunk and the tail
+ __m128i last1 = loadChunkAt(offset);
+ __m128i last2 = loadChunkAt(length - sizeof(__m128i));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), last1);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + length - sizeof(__m128i)), last2);
+ return;
}
# if !defined(__OPTIMIZE_SIZE__)