From a16d0074b9f2847fe475daa89c8174fd9d9898c0 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 17 Jan 2018 21:37:23 -0800 Subject: QUtf8: Add some UTF-8 text operation functions The first, isValidUtf8(), as the name says, returns true if the string is valid UTF-8. As a bonus, it also returns whether it's valid US-ASCII. The other two are meant to compare an UTF-8 string to either a Latin1 one or an UTF-8 one, without memory allocation. Change-Id: Ic38ec929fc3f4bb795dafffd150ad0d63e28cd32 Reviewed-by: Lars Knoll --- src/corelib/codecs/qutfcodec.cpp | 163 ++++++++++++++++++++++++++++++++++++++- src/corelib/codecs/qutfcodec_p.h | 7 ++ 2 files changed, 169 insertions(+), 1 deletion(-) (limited to 'src/corelib') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 74a716db4a..ce1b092a54 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -1,7 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2016 The Qt Company Ltd. -** Copyright (C) 2016 Intel Corporation. +** Copyright (C) 2018 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -152,6 +152,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const } return src == end; } + +static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) +{ + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16) { + __m128i data = _mm_loadu_si128(reinterpret_cast(src)); + + // check if everything is ASCII + // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII + uint n = _mm_movemask_epi8(data); + if (!n) + continue; + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(n) + 1; + + // return the non-ASCII character + return src + qCountTrailingZeroBits(n); + } + + // do four characters at a time + for ( ; end - src >= 4; src += 4) { + quint32 data = qFromUnaligned(src); + data &= 0x80808080U; + if (!data) + continue; + + // We don't try to guess which of the three bytes is ASCII and which + // one isn't. The chance that at least two of them are non-ASCII is + // better than 75%. + nextAscii = src; + return src; + } + nextAscii = end; + return src; +} #elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64 static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) { @@ -220,6 +258,34 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const } return src == end; } + +static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) +{ + // The SIMD code below is untested, so just force an early return until + // we've had the time to verify it works. + nextAscii = end; + return src; + + // do eight characters at a time + uint8x8_t msb_mask = vdup_n_u8(0x80); + uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + for ( ; end - src >= 8; src += 8) { + uint8x8_t c = vld1_u8(src); + uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); + if (!n) + continue; + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(n) + 1; + + // return the non-ASCII character + return src + qCountTrailingZeroBits(n); + } + nextAscii = end; + return src; +} #else static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *) { @@ -230,6 +296,12 @@ static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const { return false; } + +static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii) +{ + nextAscii = end; + return src; +} #endif QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len) @@ -518,6 +590,95 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte return result; } +struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii +{ + struct NoOutput {}; + static void appendUtf16(const NoOutput &, ushort) {} + static void appendUcs4(const NoOutput &, uint) {} +}; + +QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len) +{ + const uchar *src = reinterpret_cast(chars); + const uchar *end = src + len; + const uchar *nextAscii = src; + bool isValidAscii = true; + + while (src < end) { + if (src >= nextAscii) + src = simdFindNonAscii(src, end, nextAscii); + if (src == end) + break; + + do { + uchar b = *src++; + if ((b & 0x80) == 0) + continue; + + isValidAscii = false; + QUtf8NoOutputTraits::NoOutput output; + int res = QUtf8Functions::fromUtf8(b, output, src, end); + if (res < 0) { + // decoding error + return { false, false }; + } + } while (src < nextAscii); + } + + return { true, isValidAscii }; +} + +int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len) +{ + uint uc1, uc2; + auto src1 = reinterpret_cast(utf8); + auto end1 = src1 + u8len; + QStringIterator src2(utf16, utf16 + u16len); + + while (src1 < end1 && src2.hasNext()) { + uchar b = *src1++; + uint *output = &uc1; + int res = QUtf8Functions::fromUtf8(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + uc2 = src2.next(); + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - int(src2.hasNext()); +} + +int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s) +{ + uint uc1; + auto src1 = reinterpret_cast(utf8); + auto end1 = src1 + u8len; + auto src2 = reinterpret_cast(s.latin1()); + auto end2 = src2 + s.size(); + + while (src1 < end1 && src2 < end2) { + uchar b = *src1++; + uint *output = &uc1; + int res = QUtf8Functions::fromUtf8(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + uint uc2 = *src2++; + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - (end2 > src2); +} + QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e) { DataEndianness endian = e; diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h index 36752cc909..659a229dae 100644 --- a/src/corelib/codecs/qutfcodec_p.h +++ b/src/corelib/codecs/qutfcodec_p.h @@ -290,6 +290,13 @@ struct QUtf8 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); static QByteArray convertFromUnicode(const QChar *, int); static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); + struct ValidUtf8Result { + bool isValidUtf8; + bool isValidAscii; + }; + static ValidUtf8Result isValidUtf8(const char *, qsizetype); + static int compareUtf8(const char *, qsizetype, const QChar *, int); + static int compareUtf8(const char *, qsizetype, QLatin1String s); }; struct QUtf16 -- cgit v1.2.3