diff options
Diffstat (limited to 'src/corelib/codecs/qutfcodec.cpp')
-rw-r--r-- | src/corelib/codecs/qutfcodec.cpp | 375 |
1 files changed, 244 insertions, 131 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index e425f8634c..20bacb1584 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -1,6 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies). +** Copyright (C) 2013 Intel Corporation ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -44,10 +45,124 @@ #include "qendian.h" #include "qchar.h" +#include "private/qsimd_p.h" + QT_BEGIN_NAMESPACE enum { Endian = 0, Data = 1 }; +#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) +static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) +{ + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16, dst += 16) { + __m128i data1 = _mm_loadu_si128((__m128i*)src); + __m128i data2 = _mm_loadu_si128(1+(__m128i*)src); + + + // check if everything is ASCII + // the highest ASCII value is U+007F + // Do the packing directly: + // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit + // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff, + // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII, + // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as + // "non-ASCII", but it's an acceptable compromise. + __m128i packed = _mm_packus_epi16(data1, data2); + __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); + + // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL) + ushort n = ~_mm_movemask_epi8(nonAscii); + if (n) { + // copy the front part that is still ASCII + while (!(n & 1)) { + *dst++ = *src++; + n >>= 1; + } + + // find the next probable ASCII character + // we don't want to load 32 bytes again in this loop if we know there are non-ASCII + // characters still coming + n = _bit_scan_reverse(n); + nextAscii = src + n; + return false; + } + + // pack + _mm_storeu_si128((__m128i*)dst, packed); + } + return src == end; +} + +static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) +{ + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16, dst += 16) { + __m128i data = _mm_loadu_si128((__m128i*)src); + + // check if everything is ASCII + // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII + uint n = _mm_movemask_epi8(data); + if (n) { + // copy the front part that is still ASCII + while (!(n & 1)) { + *dst++ = *src++; + n >>= 1; + } + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + n = _bit_scan_reverse(n); + nextAscii = src + n; + return false; + } + + // unpack + _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); + _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); + } + return src == end; +} +#else +static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *) +{ + return false; +} + +static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *) +{ + return false; +} +#endif + +QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len) +{ + // create a QByteArray with the worst case scenario size + QByteArray result(len * 3, Qt::Uninitialized); + uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData())); + const ushort *src = reinterpret_cast<const ushort *>(uc); + const ushort *const end = src + len; + + while (src != end) { + const ushort *nextAscii = end; + if (simdEncodeAscii(dst, nextAscii, src, end)) + break; + + do { + ushort uc = *src++; + int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end); + if (res < 0) { + // encoding error - append '?' + *dst++ = '?'; + } + } while (src < nextAscii); + } + + result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); + return result; +} + QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state) { uchar replacement = '?'; @@ -62,61 +177,46 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve surrogate_high = state->state_data[0]; } - QByteArray rstr; - rstr.resize(rlen); - uchar* cursor = (uchar*)rstr.data(); - const QChar *ch = uc; + + QByteArray rstr(rlen, Qt::Uninitialized); + uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData())); + const ushort *src = reinterpret_cast<const ushort *>(uc); + const ushort *const end = src + len; + int invalid = 0; if (state && !(state->flags & QTextCodec::IgnoreHeader)) { + // append UTF-8 BOM *cursor++ = 0xef; *cursor++ = 0xbb; *cursor++ = 0xbf; } - const QChar *end = ch + len; - while (ch < end) { - uint u = ch->unicode(); - if (surrogate_high >= 0) { - if (ch->isLowSurrogate()) { - u = QChar::surrogateToUcs4(surrogate_high, u); - surrogate_high = -1; - } else { - // high surrogate without low - *cursor = replacement; - ++ch; - ++invalid; - surrogate_high = -1; - continue; - } - } else if (ch->isLowSurrogate()) { - // low surrogate without high - *cursor = replacement; - ++ch; - ++invalid; - continue; - } else if (ch->isHighSurrogate()) { - surrogate_high = u; - ++ch; - continue; + const ushort *nextAscii = src; + while (src != end) { + int res; + ushort uc; + if (surrogate_high != -1) { + uc = surrogate_high; + surrogate_high = -1; + res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); + } else { + if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end)) + break; + + uc = *src++; + res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); } + if (Q_LIKELY(res >= 0)) + continue; - if (u < 0x80) { - *cursor++ = (uchar)u; - } else { - if (u < 0x0800) { - *cursor++ = 0xc0 | ((uchar) (u >> 6)); - } else { - if (QChar::requiresSurrogates(u)) { - *cursor++ = 0xf0 | ((uchar) (u >> 18)); - *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); - } else { - *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f); - } - *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f); - } - *cursor++ = 0x80 | ((uchar) (u&0x3f)); + if (res == QUtf8BaseTraits::Error) { + // encoding error + ++invalid; + *cursor++ = replacement; + } else if (res == QUtf8BaseTraits::EndOfString) { + surrogate_high = uc; + break; } - ++ch; } rstr.resize(cursor - (const uchar*)rstr.constData()); @@ -132,114 +232,127 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve return rstr; } +QString QUtf8::convertToUnicode(const char *chars, int len) +{ + QString result(len + 1, Qt::Uninitialized); // worst case + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + + while (src < end) { + const uchar *nextAscii = end; + if (simdDecodeAscii(dst, nextAscii, src, end)) + break; + + do { + uchar b = *src++; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); + if (res < 0) { + // decoding error + *dst++ = QChar::ReplacementCharacter; + } + } while (src < nextAscii); + } + + result.truncate(dst - reinterpret_cast<const ushort *>(result.constData())); + return result; +} + QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state) { bool headerdone = false; ushort replacement = QChar::ReplacementCharacter; int need = 0; - int error = -1; - uint uc = 0; - uint min_uc = 0; + int invalid = 0; + int res; + uchar ch = 0; + + QString result(need + len + 1, Qt::Uninitialized); // worst case + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + if (state) { if (state->flags & QTextCodec::IgnoreHeader) headerdone = true; if (state->flags & QTextCodec::ConvertInvalidToNull) replacement = QChar::Null; - need = state->remainingChars; - if (need) { - uc = state->state_data[0]; - min_uc = state->state_data[1]; - } - } - if (!headerdone && len > 3 - && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { - // starts with a byte order mark - chars += 3; - len -= 3; - headerdone = true; - } - - QString result(need + len + 1, Qt::Uninitialized); // worst case - ushort *qch = (ushort *)result.unicode(); - uchar ch; - int invalid = 0; - - for (int i = 0; i < len; ++i) { - ch = chars[i]; - if (need) { - if ((ch&0xc0) == 0x80) { - uc = (uc << 6) | (ch & 0x3f); - --need; - if (!need) { - // utf-8 bom composes into 0xfeff code point - if (!headerdone && uc == 0xfeff) { - // don't do anything, just skip the BOM - } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { - // surrogate pair - Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); - *qch++ = QChar::highSurrogate(uc); - *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { - // error: overlong sequence, UTF16 surrogate or non-character - *qch++ = replacement; - ++invalid; - } else { - *qch++ = uc; - } - headerdone = true; - } - } else { - // error - i = error; - *qch++ = replacement; - ++invalid; - need = 0; - headerdone = true; - } - } else { - if (ch < 128) { - *qch++ = ushort(ch); - headerdone = true; - } else if ((ch & 0xe0) == 0xc0) { - uc = ch & 0x1f; - need = 1; - error = i; - min_uc = 0x80; - headerdone = true; - } else if ((ch & 0xf0) == 0xe0) { - uc = ch & 0x0f; - need = 2; - error = i; - min_uc = 0x800; - } else if ((ch&0xf8) == 0xf0) { - uc = ch & 0x07; - need = 3; - error = i; - min_uc = 0x10000; - headerdone = true; - } else { - // error - *qch++ = replacement; + if (state->remainingChars) { + // handle incoming state first + uchar remainingCharsData[4]; // longest UTF-8 sequence possible + int remainingCharsCount = state->remainingChars; + int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); + + memset(remainingCharsData, 0, sizeof(remainingCharsData)); + memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); + memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); + + const uchar *begin = &remainingCharsData[1]; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, + static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); + if (res == QUtf8BaseTraits::EndOfString) { + // if we got EndOfString again, then there were too few bytes in src; + // copy to our state and return + state->remainingChars = remainingCharsCount + newCharsToCopy; + memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); + return QString(); + } else if (res == QUtf8BaseTraits::Error) { ++invalid; + *dst++ = replacement; + } else if (!headerdone && res >= 0) { + // eat the UTF-8 BOM headerdone = true; + if (dst[-1] == 0xfeff) + --dst; } + + // adjust src now that we have maybe consumed a few chars + //Q_ASSERT(res > remainingCharsCount) + src += res - remainingCharsCount; } } - if (!state && need > 0) { - // unterminated UTF sequence - for (int i = error; i < len; ++i) { - *qch++ = replacement; + + // main body, stateless decoding + res = 0; + const uchar *nextAscii = src; + while (res >= 0 && src < end) { + if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) + break; + + ch = *src++; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end); + if (!headerdone && res >= 0) { + headerdone = true; + // eat the UTF-8 BOM + if (dst[-1] == 0xfeff) + --dst; + } + if (res == QUtf8BaseTraits::Error) { + res = 0; ++invalid; + *dst++ = replacement; } } - result.truncate(qch - (ushort *)result.unicode()); + + if (!state && res == QUtf8BaseTraits::EndOfString) { + // unterminated UTF sequence + *dst++ = QChar::ReplacementCharacter; + while (src++ < end) + *dst++ = QChar::ReplacementCharacter; + } + + result.truncate(dst - (ushort *)result.unicode()); if (state) { state->invalidChars += invalid; - state->remainingChars = need; if (headerdone) state->flags |= QTextCodec::IgnoreHeader; - state->state_data[0] = need ? uc : 0; - state->state_data[1] = need ? min_uc : 0; + if (res == QUtf8BaseTraits::EndOfString) { + --src; // unread the byte in ch + state->remainingChars = end - src; + memcpy(&state->state_data[0], src, end - src); + } else { + state->remainingChars = 0; + } } return result; } |