summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs/qutfcodec.cpp
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2018-01-17 21:37:23 -0800
committerThiago Macieira <thiago.macieira@intel.com>2018-02-03 21:31:57 +0000
commita16d0074b9f2847fe475daa89c8174fd9d9898c0 (patch)
treeef96d0f84b7ee65b6a8e22ce1a991cad981c3ab6 /src/corelib/codecs/qutfcodec.cpp
parent687dc7fac7fec6c2f77aa1d130119a8bc1fd368b (diff)
QUtf8: Add some UTF-8 text operation functions
The first, isValidUtf8(), as the name says, returns true if the string is valid UTF-8. As a bonus, it also returns whether it's valid US-ASCII. The other two are meant to compare an UTF-8 string to either a Latin1 one or an UTF-8 one, without memory allocation. Change-Id: Ic38ec929fc3f4bb795dafffd150ad0d63e28cd32 Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Diffstat (limited to 'src/corelib/codecs/qutfcodec.cpp')
-rw-r--r--src/corelib/codecs/qutfcodec.cpp163
1 files changed, 162 insertions, 1 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 74a716db4a..ce1b092a54 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -1,7 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
-** Copyright (C) 2016 Intel Corporation.
+** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
@@ -152,6 +152,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
}
return src == end;
}
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+ // do sixteen characters at a time
+ for ( ; end - src >= 16; src += 16) {
+ __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+
+ // check if everything is ASCII
+ // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+ uint n = _mm_movemask_epi8(data);
+ if (!n)
+ continue;
+
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+
+ // do four characters at a time
+ for ( ; end - src >= 4; src += 4) {
+ quint32 data = qFromUnaligned<quint32>(src);
+ data &= 0x80808080U;
+ if (!data)
+ continue;
+
+ // We don't try to guess which of the three bytes is ASCII and which
+ // one isn't. The chance that at least two of them are non-ASCII is
+ // better than 75%.
+ nextAscii = src;
+ return src;
+ }
+ nextAscii = end;
+ return src;
+}
#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
{
@@ -220,6 +258,34 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
}
return src == end;
}
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+ // The SIMD code below is untested, so just force an early return until
+ // we've had the time to verify it works.
+ nextAscii = end;
+ return src;
+
+ // do eight characters at a time
+ uint8x8_t msb_mask = vdup_n_u8(0x80);
+ uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+ for ( ; end - src >= 8; src += 8) {
+ uint8x8_t c = vld1_u8(src);
+ uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
+ if (!n)
+ continue;
+
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ nextAscii = src + qBitScanReverse(n) + 1;
+
+ // return the non-ASCII character
+ return src + qCountTrailingZeroBits(n);
+ }
+ nextAscii = end;
+ return src;
+}
#else
static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
{
@@ -230,6 +296,12 @@ static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const
{
return false;
}
+
+static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
+{
+ nextAscii = end;
+ return src;
+}
#endif
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
@@ -518,6 +590,95 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
return result;
}
+struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
+{
+ struct NoOutput {};
+ static void appendUtf16(const NoOutput &, ushort) {}
+ static void appendUcs4(const NoOutput &, uint) {}
+};
+
+QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
+{
+ const uchar *src = reinterpret_cast<const uchar *>(chars);
+ const uchar *end = src + len;
+ const uchar *nextAscii = src;
+ bool isValidAscii = true;
+
+ while (src < end) {
+ if (src >= nextAscii)
+ src = simdFindNonAscii(src, end, nextAscii);
+ if (src == end)
+ break;
+
+ do {
+ uchar b = *src++;
+ if ((b & 0x80) == 0)
+ continue;
+
+ isValidAscii = false;
+ QUtf8NoOutputTraits::NoOutput output;
+ int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
+ if (res < 0) {
+ // decoding error
+ return { false, false };
+ }
+ } while (src < nextAscii);
+ }
+
+ return { true, isValidAscii };
+}
+
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len)
+{
+ uint uc1, uc2;
+ auto src1 = reinterpret_cast<const uchar *>(utf8);
+ auto end1 = src1 + u8len;
+ QStringIterator src2(utf16, utf16 + u16len);
+
+ while (src1 < end1 && src2.hasNext()) {
+ uchar b = *src1++;
+ uint *output = &uc1;
+ int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+ if (res < 0) {
+ // decoding error
+ uc1 = QChar::ReplacementCharacter;
+ }
+
+ uc2 = src2.next();
+ if (uc1 != uc2)
+ return int(uc1) - int(uc2);
+ }
+
+ // the shorter string sorts first
+ return (end1 > src1) - int(src2.hasNext());
+}
+
+int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
+{
+ uint uc1;
+ auto src1 = reinterpret_cast<const uchar *>(utf8);
+ auto end1 = src1 + u8len;
+ auto src2 = reinterpret_cast<const uchar *>(s.latin1());
+ auto end2 = src2 + s.size();
+
+ while (src1 < end1 && src2 < end2) {
+ uchar b = *src1++;
+ uint *output = &uc1;
+ int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
+ if (res < 0) {
+ // decoding error
+ uc1 = QChar::ReplacementCharacter;
+ }
+
+ uint uc2 = *src2++;
+ if (uc1 != uc2)
+ return int(uc1) - int(uc2);
+ }
+
+ // the shorter string sorts first
+ return (end1 > src1) - (end2 > src2);
+}
+
QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
DataEndianness endian = e;