summary refs log tree commit diff stats
path: root/src/corelib/codecs/qutfcodec.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/codecs/qutfcodec.cpp')
-rw-r--r-- src/corelib/codecs/qutfcodec.cpp 261
1 files changed, 130 insertions, 131 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index e425f8634c..b0e0b3f010 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtCore module of the Qt Toolkit.
@@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE
enum { Endian = 0, Data = 1 };
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len) // stateless overload: encodes the len QChars at uc as UTF-8; writes no BOM and emits '?' for every encoding error
+{
+ // create a QByteArray with the worst case scenario size
+ QByteArray result(len * 3, Qt::Uninitialized); // three output bytes are reserved per input code unit
+ uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData())); // write cursor into result's buffer
+ const ushort *src = reinterpret_cast<const ushort *>(uc); // read cursor over the input, viewed as 16-bit units
+ const ushort *const end = src + len; // one past the last input unit
+
+ while (src != end) {
+ ushort uc = *src++; // NOTE: this local shadows the 'uc' parameter inside the loop
+ int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end); // presumably advances dst (and src when a surrogate pair is consumed) - confirm in QUtf8Functions
+ if (res < 0) {
+ // encoding error - append '?'
+ *dst++ = '?';
+ }
+ }
+
+ result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); // shrink to the distance dst moved from the start of the buffer
+ return result;
+}
+
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
{
uchar replacement = '?';
@@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
surrogate_high = state->state_data[0];
}
- QByteArray rstr;
- rstr.resize(rlen);
- uchar* cursor = (uchar*)rstr.data();
- const QChar *ch = uc;
+
+ QByteArray rstr(rlen, Qt::Uninitialized);
+ uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
+ const ushort *src = reinterpret_cast<const ushort *>(uc);
+ const ushort *const end = src + len;
+
int invalid = 0;
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
+ // append UTF-8 BOM
*cursor++ = 0xef;
*cursor++ = 0xbb;
*cursor++ = 0xbf;
}
- const QChar *end = ch + len;
- while (ch < end) {
- uint u = ch->unicode();
- if (surrogate_high >= 0) {
- if (ch->isLowSurrogate()) {
- u = QChar::surrogateToUcs4(surrogate_high, u);
- surrogate_high = -1;
- } else {
- // high surrogate without low
- *cursor = replacement;
- ++ch;
- ++invalid;
- surrogate_high = -1;
- continue;
- }
- } else if (ch->isLowSurrogate()) {
- // low surrogate without high
- *cursor = replacement;
- ++ch;
- ++invalid;
+ while (src != end) {
+ ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
+ surrogate_high = -1;
+ int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+ if (Q_LIKELY(res >= 0))
continue;
- } else if (ch->isHighSurrogate()) {
- surrogate_high = u;
- ++ch;
- continue;
- }
- if (u < 0x80) {
- *cursor++ = (uchar)u;
- } else {
- if (u < 0x0800) {
- *cursor++ = 0xc0 | ((uchar) (u >> 6));
- } else {
- if (QChar::requiresSurrogates(u)) {
- *cursor++ = 0xf0 | ((uchar) (u >> 18));
- *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
- } else {
- *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
- }
- *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
- }
- *cursor++ = 0x80 | ((uchar) (u&0x3f));
+ if (res == QUtf8BaseTraits::Error) {
+ // encoding error
+ ++invalid;
+ *cursor++ = replacement;
+ } else if (res == QUtf8BaseTraits::EndOfString) {
+ surrogate_high = uc;
+ break;
}
- ++ch;
}
rstr.resize(cursor - (const uchar*)rstr.constData());
@@ -132,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
return rstr;
}
+QString QUtf8::convertToUnicode(const char *chars, int len) // stateless overload: decodes len bytes of UTF-8 at chars; every decoding error becomes QChar::ReplacementCharacter
+{
+ QString result(len + 1, Qt::Uninitialized); // worst case
+ ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); // write cursor into result's buffer
+ const uchar *src = reinterpret_cast<const uchar *>(chars); // read cursor over the input bytes
+ const uchar *end = src + len; // one past the last input byte
+
+ while (src < end) {
+ uchar b = *src++; // first byte of the next sequence
+ int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); // presumably advances dst and src on success - confirm in QUtf8Functions
+ if (res < 0) {
+ // decoding error
+ *dst++ = QChar::ReplacementCharacter; // any negative result is treated as an error here; no BOM handling in this overload
+ }
+ }
+
+ result.truncate(dst - reinterpret_cast<const ushort *>(result.constData())); // shrink to the distance dst moved from the start of the buffer
+ return result;
+}
+
QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
{
bool headerdone = false;
ushort replacement = QChar::ReplacementCharacter;
int need = 0;
- int error = -1;
- uint uc = 0;
- uint min_uc = 0;
+ int invalid = 0; // number of replacements emitted in this call; added to state->invalidChars at the end
+ int res; // last fromUtf8 result, compared against QUtf8BaseTraits::Error / EndOfString below
+ uchar ch = 0; // first byte of the sequence currently being decoded by the main loop
+
+ QString result(need + len + 1, Qt::Uninitialized); // worst case (need is always 0 at this point)
+ ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); // write cursor into result's buffer
+ const uchar *src = reinterpret_cast<const uchar *>(chars); // read cursor over the input bytes
+ const uchar *end = src + len; // one past the last input byte
+
if (state) {
if (state->flags & QTextCodec::IgnoreHeader)
headerdone = true;
if (state->flags & QTextCodec::ConvertInvalidToNull)
replacement = QChar::Null;
- need = state->remainingChars;
- if (need) {
- uc = state->state_data[0];
- min_uc = state->state_data[1];
- }
- }
- if (!headerdone && len > 3
- && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
- // starts with a byte order mark
- chars += 3;
- len -= 3;
- headerdone = true;
- }
-
- QString result(need + len + 1, Qt::Uninitialized); // worst case
- ushort *qch = (ushort *)result.unicode();
- uchar ch;
- int invalid = 0;
-
- for (int i = 0; i < len; ++i) {
- ch = chars[i];
- if (need) {
- if ((ch&0xc0) == 0x80) {
- uc = (uc << 6) | (ch & 0x3f);
- --need;
- if (!need) {
- // utf-8 bom composes into 0xfeff code point
- if (!headerdone && uc == 0xfeff) {
- // don't do anything, just skip the BOM
- } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
- // surrogate pair
- Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
- *qch++ = QChar::highSurrogate(uc);
- *qch++ = QChar::lowSurrogate(uc);
- } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
- // error: overlong sequence, UTF16 surrogate or non-character
- *qch++ = replacement;
- ++invalid;
- } else {
- *qch++ = uc;
- }
- headerdone = true;
- }
- } else {
- // error
- i = error;
- *qch++ = replacement;
- ++invalid;
- need = 0;
- headerdone = true;
- }
- } else {
- if (ch < 128) {
- *qch++ = ushort(ch);
- headerdone = true;
- } else if ((ch & 0xe0) == 0xc0) {
- uc = ch & 0x1f;
- need = 1;
- error = i;
- min_uc = 0x80;
- headerdone = true;
- } else if ((ch & 0xf0) == 0xe0) {
- uc = ch & 0x0f;
- need = 2;
- error = i;
- min_uc = 0x800;
- } else if ((ch&0xf8) == 0xf0) {
- uc = ch & 0x07;
- need = 3;
- error = i;
- min_uc = 0x10000;
- headerdone = true;
- } else {
- // error
- *qch++ = replacement;
+ if (state->remainingChars) {
+ // handle incoming state first
+ uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+ int remainingCharsCount = state->remainingChars; // bytes of an unfinished sequence saved by the previous call
+ int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); // top the scratch buffer up from src, limited by what src actually holds
+
+ memset(remainingCharsData, 0, sizeof(remainingCharsData));
+ memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+ memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+ const uchar *begin = &remainingCharsData[1]; // decoding continues after the first saved byte
+ res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+ static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+ if (res == QUtf8BaseTraits::EndOfString) {
+ // if we got EndOfString again, then there were too few bytes in src;
+ // copy to our state and return
+ state->remainingChars = remainingCharsCount + newCharsToCopy;
+ memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+ return QString(); // nothing decoded yet; all input was stored in state
+ } else if (res == QUtf8BaseTraits::Error) {
++invalid;
+ *dst++ = replacement;
+ } else if (!headerdone && res >= 0) {
+ // eat the UTF-8 BOM
headerdone = true;
+ if (dst[-1] == 0xfeff)
+ --dst;
}
+
+ // adjust src now that we have maybe consumed a few chars
+ //Q_ASSERT(res > remainingCharsCount)
+ if (res >= 0) src += res - remainingCharsCount; // only on success: Error is negative and would move src before the start of chars
}
}
- if (!state && need > 0) {
- // unterminated UTF sequence
- for (int i = error; i < len; ++i) {
- *qch++ = replacement;
+
+ // main body, stateless decoding
+ res = 0; // reset so a failed state-handling step above does not stop the loop before it starts
+ while (res >= 0 && src < end) {
+ ch = *src++;
+ res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+ if (!headerdone && res >= 0) {
+ headerdone = true;
+ // eat the UTF-8 BOM
+ if (dst[-1] == 0xfeff)
+ --dst;
+ }
+ if (res == QUtf8BaseTraits::Error) {
+ res = 0; // clear the error so the loop condition lets decoding continue
++invalid;
+ *dst++ = replacement;
}
}
- result.truncate(qch - (ushort *)result.unicode());
+
+ if (!state && res == QUtf8BaseTraits::EndOfString) {
+ // unterminated UTF sequence
+ *dst++ = QChar::ReplacementCharacter; // one for the byte already read into ch
+ while (src++ < end)
+ *dst++ = QChar::ReplacementCharacter; // and one per remaining input byte
+ }
+
+ result.truncate(dst - (ushort *)result.unicode());
if (state) {
state->invalidChars += invalid;
- state->remainingChars = need;
if (headerdone)
state->flags |= QTextCodec::IgnoreHeader;
- state->state_data[0] = need ? uc : 0;
- state->state_data[1] = need ? min_uc : 0;
+ if (res == QUtf8BaseTraits::EndOfString) {
+ --src; // unread the byte in ch
+ state->remainingChars = end - src;
+ memcpy(&state->state_data[0], src, end - src); // save the raw bytes of the unfinished sequence for the next call
+ } else {
+ state->remainingChars = 0;
+ }
}
return result;
}