From 274e07a339ebedddd8b3366342ebde3cbfb9536a Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Fri, 22 May 2020 17:42:26 -0700 Subject: Optimize non-vectorized UTF-8 to UTF-16 comparison to US-ASCII This allows us to skip the surrogate pair decoding too, since it can't match anyway. Change-Id: Ied637aece2a7427b8a2dfffd16118183e5d76794 Reviewed-by: Lars Knoll --- src/corelib/text/qstringconverter.cpp | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 2111d22b2f..1c81248910 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -1,7 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2020 The Qt Company Ltd. -** Copyright (C) 2018 Intel Corporation. +** Copyright (C) 2020 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -715,27 +715,35 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len) int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, qsizetype u16len) noexcept { - uint uc1, uc2; - auto src1 = reinterpret_cast(utf8); + auto src1 = reinterpret_cast(utf8); auto end1 = src1 + u8len; - QStringIterator src2(utf16, utf16 + u16len); + auto src2 = reinterpret_cast(utf16); + auto end2 = src2 + u16len; - while (src1 < end1 && src2.hasNext()) { - uchar b = *src1++; - uint *output = &uc1; - int res = QUtf8Functions::fromUtf8(b, output, src1, end1); - if (res < 0) { - // decoding error - uc1 = QChar::ReplacementCharacter; + while (src1 < end1 && src2 < end2) { + char32_t uc1 = *src1++; + char32_t uc2 = *src2++; + + if (uc1 >= 0x80) { + char32_t *output = &uc1; + int res = QUtf8Functions::fromUtf8(uc1, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + // Only decode the UTF-16 surrogate pair if the UTF-8 code point + // wasn't US-ASCII (a surrogate cannot match US-ASCII). + if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2)) + uc2 = QChar::surrogateToUcs4(uc2, *src2++); } - uc2 = src2.next(); if (uc1 != uc2) return int(uc1) - int(uc2); } // the shorter string sorts first - return (end1 > src1) - int(src2.hasNext()); + return (end1 > src1) - int(end2 > src2); } int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s) -- cgit v1.2.3