QUtf8: Add some UTF-8 text operation functions

The first, isValidUtf8(), as the name says, returns true if the string is valid UTF-8. As a bonus, it also returns whether it's valid US-ASCII. The other two are meant to compare an UTF-8 string to either a Latin1 one or an UTF-8 one, without memory allocation. Change-Id: Ic38ec929fc3f4bb795dafffd150ad0d63e28cd32 Reviewed-by: Lars Knoll <lars.knoll@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2018-01-17 21:37:23 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2018-02-03 21:31:57 +0000
commit: a16d0074b9f2847fe475daa89c8174fd9d9898c0 (patch)
tree: ef96d0f84b7ee65b6a8e22ce1a991cad981c3ab6 /src/corelib/codecs/qutfcodec.cpp
parent: 687dc7fac7fec6c2f77aa1d130119a8bc1fd368b (diff)
1 files changed, 162 insertions, 1 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 74a716db4a..ce1b092a54 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -1,7 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2016 The Qt Company Ltd.
-** Copyright (C) 2016 Intel Corporation.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -152,6 +152,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
     }
     return src == end;
 }
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+    // do sixteen characters at a time
+    for ( ; end - src >= 16; src += 16) {
+        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+
+        // check if everything is ASCII
+        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+        uint n = _mm_movemask_epi8(data);
+        if (!n)
+            continue;
+
+        // find the next probable ASCII character
+        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+        // characters still coming
+        nextAscii = src + qBitScanReverse(n) + 1;
+
+        // return the non-ASCII character
+        return src + qCountTrailingZeroBits(n);
+    }
+
+    // do four characters at a time
+    for ( ; end - src >= 4; src += 4) {
+        quint32 data = qFromUnaligned<quint32>(src);
+        data &= 0x80808080U;
+        if (!data)
+            continue;
+
+        // We don't try to guess which of the three bytes is ASCII and which
+        // one isn't. The chance that at least two of them are non-ASCII is
+        // better than 75%.
+        nextAscii = src;
+        return src;
+    }
+    nextAscii = end;
+    return src;
+}
 #elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
 static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
 {
@@ -220,6 +258,34 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
     }
     return src == end;
 }
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+    // The SIMD code below is untested, so just force an early return until
+    // we've had the time to verify it works.
+    nextAscii = end;
+    return src;
+
+    // do eight characters at a time
+    uint8x8_t msb_mask = vdup_n_u8(0x80);
+    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+    for ( ; end - src >= 8; src += 8) {
+        uint8x8_t c = vld1_u8(src);
+        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
+        if (!n)
+            continue;
+
+        // find the next probable ASCII character
+        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+        // characters still coming
+        nextAscii = src + qBitScanReverse(n) + 1;
+
+        // return the non-ASCII character
+        return src + qCountTrailingZeroBits(n);
+    }
+    nextAscii = end;
+    return src;
+}
 #else
 static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
 {
@@ -230,6 +296,12 @@ static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const
 {
     return false;
 }
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+    nextAscii = end;
+    return src;
+}
 #endif
 
 QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
@@ -518,6 +590,95 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
     return result;
 }
 
+struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
+{
+    struct NoOutput {};
+    static void appendUtf16(const NoOutput &, ushort) {}
+    static void appendUcs4(const NoOutput &, uint) {}
+};
+
+QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
+{
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+    const uchar *nextAscii = src;
+    bool isValidAscii = true;
+
+    while (src < end) {
+        if (src >= nextAscii)
+            src = simdFindNonAscii(src, end, nextAscii);
+        if (src == end)
+            break;
+
+        do {
+            uchar b = *src++;
+            if ((b & 0x80) == 0)
+                continue;
+
+            isValidAscii = false;
+            QUtf8NoOutputTraits::NoOutput output;
+            int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
+            if (res < 0) {
+                // decoding error
+                return { false, false };
+            }
+        } while (src < nextAscii);
+    }
+
+    return { true, isValidAscii };
+}
+
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len)
+{
+    uint uc1, uc2;
+    auto src1 = reinterpret_cast<const uchar *>(utf8);
+    auto end1 = src1 + u8len;
+    QStringIterator src2(utf16, utf16 + u16len);
+
+    while (src1 < end1 && src2.hasNext()) {
+        uchar b = *src1++;
+        uint *output = &uc1;
+        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+        if (res < 0) {
+            // decoding error
+            uc1 = QChar::ReplacementCharacter;
+        }
+
+        uc2 = src2.next();
+        if (uc1 != uc2)
+            return int(uc1) - int(uc2);
+    }
+
+    // the shorter string sorts first
+    return (end1 > src1) - int(src2.hasNext());
+}
+
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
+{
+    uint uc1;
+    auto src1 = reinterpret_cast<const uchar *>(utf8);
+    auto end1 = src1 + u8len;
+    auto src2 = reinterpret_cast<const uchar *>(s.latin1());
+    auto end2 = src2 + s.size();
+
+    while (src1 < end1 && src2 < end2) {
+        uchar b = *src1++;
+        uint *output = &uc1;
+        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+        if (res < 0) {
+            // decoding error
+            uc1 = QChar::ReplacementCharacter;
+        }
+
+        uint uc2 = *src2++;
+        if (uc1 != uc2)
+            return int(uc1) - int(uc2);
+    }
+
+    // the shorter string sorts first
+    return (end1 > src1) - (end2 > src2);
+}
+
 QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
 {
     DataEndianness endian = e;
author	Thiago Macieira <thiago.macieira@intel.com>	2018-01-17 21:37:23 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2018-02-03 21:31:57 +0000
commit	a16d0074b9f2847fe475daa89c8174fd9d9898c0 (patch)
tree	ef96d0f84b7ee65b6a8e22ce1a991cad981c3ab6 /src/corelib/codecs/qutfcodec.cpp
parent	687dc7fac7fec6c2f77aa1d130119a8bc1fd368b (diff)