summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/codecs')
-rw-r--r--src/corelib/codecs/qutfcodec.cpp175
-rw-r--r--src/corelib/codecs/qutfcodec_p.h105
2 files changed, 194 insertions, 86 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index d1fc5b851a..b0e0b3f010 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -128,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
return rstr;
}
+QString QUtf8::convertToUnicode(const char *chars, int len)
+{
+ QString result(len + 1, Qt::Uninitialized); // worst case
+ ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+ const uchar *src = reinterpret_cast<const uchar *>(chars);
+ const uchar *end = src + len;
+
+ while (src < end) {
+ uchar b = *src++;
+ int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+ if (res < 0) {
+ // decoding error
+ *dst++ = QChar::ReplacementCharacter;
+ }
+ }
+
+ result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
+ return result;
+}
+
QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
{
bool headerdone = false;
ushort replacement = QChar::ReplacementCharacter;
int need = 0;
- int error = -1;
- uint uc = 0;
- uint min_uc = 0;
+ int invalid = 0;
+ int res;
+ uchar ch = 0;
+
+ QString result(need + len + 1, Qt::Uninitialized); // worst case
+ ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+ const uchar *src = reinterpret_cast<const uchar *>(chars);
+ const uchar *end = src + len;
+
if (state) {
if (state->flags & QTextCodec::IgnoreHeader)
headerdone = true;
if (state->flags & QTextCodec::ConvertInvalidToNull)
replacement = QChar::Null;
- need = state->remainingChars;
- if (need) {
- uc = state->state_data[0];
- min_uc = state->state_data[1];
- }
- }
- if (!headerdone && len > 3
- && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
- // starts with a byte order mark
- chars += 3;
- len -= 3;
- headerdone = true;
- }
-
- QString result(need + len + 1, Qt::Uninitialized); // worst case
- ushort *qch = (ushort *)result.unicode();
- uchar ch;
- int invalid = 0;
-
- for (int i = 0; i < len; ++i) {
- ch = chars[i];
- if (need) {
- if ((ch&0xc0) == 0x80) {
- uc = (uc << 6) | (ch & 0x3f);
- --need;
- if (!need) {
- // utf-8 bom composes into 0xfeff code point
- if (!headerdone && uc == 0xfeff) {
- // don't do anything, just skip the BOM
- } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
- // surrogate pair
- Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
- *qch++ = QChar::highSurrogate(uc);
- *qch++ = QChar::lowSurrogate(uc);
- } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
- // error: overlong sequence, UTF16 surrogate or non-character
- *qch++ = replacement;
- ++invalid;
- } else {
- *qch++ = uc;
- }
- headerdone = true;
- }
- } else {
- // error
- i = error;
- *qch++ = replacement;
- ++invalid;
- need = 0;
- headerdone = true;
- }
- } else {
- if (ch < 128) {
- *qch++ = ushort(ch);
- headerdone = true;
- } else if ((ch & 0xe0) == 0xc0) {
- uc = ch & 0x1f;
- need = 1;
- error = i;
- min_uc = 0x80;
- headerdone = true;
- } else if ((ch & 0xf0) == 0xe0) {
- uc = ch & 0x0f;
- need = 2;
- error = i;
- min_uc = 0x800;
- } else if ((ch&0xf8) == 0xf0) {
- uc = ch & 0x07;
- need = 3;
- error = i;
- min_uc = 0x10000;
- headerdone = true;
- } else {
- // error
- *qch++ = replacement;
+ if (state->remainingChars) {
+ // handle incoming state first
+ uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+ int remainingCharsCount = state->remainingChars;
+ int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
+
+ memset(remainingCharsData, 0, sizeof(remainingCharsData));
+ memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+ memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+ const uchar *begin = &remainingCharsData[1];
+ res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+ static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+ if (res == QUtf8BaseTraits::EndOfString) {
+ // if we got EndOfString again, then there were too few bytes in src;
+ // copy to our state and return
+ state->remainingChars = remainingCharsCount + newCharsToCopy;
+ memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+ return QString();
+ } else if (res == QUtf8BaseTraits::Error) {
++invalid;
+ *dst++ = replacement;
+ } else if (!headerdone && res >= 0) {
+ // eat the UTF-8 BOM
headerdone = true;
+ if (dst[-1] == 0xfeff)
+ --dst;
}
+
+ // adjust src now that we have maybe consumed a few chars
+ //Q_ASSERT(res > remainingCharsCount)
+ src += res - remainingCharsCount;
}
}
- if (!state && need > 0) {
- // unterminated UTF sequence
- for (int i = error; i < len; ++i) {
- *qch++ = replacement;
+
+ // main body, stateless decoding
+ res = 0;
+ while (res >= 0 && src < end) {
+ ch = *src++;
+ res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+ if (!headerdone && res >= 0) {
+ headerdone = true;
+ // eat the UTF-8 BOM
+ if (dst[-1] == 0xfeff)
+ --dst;
+ }
+ if (res == QUtf8BaseTraits::Error) {
+ res = 0;
++invalid;
+ *dst++ = replacement;
}
}
- result.truncate(qch - (ushort *)result.unicode());
+
+ if (!state && res == QUtf8BaseTraits::EndOfString) {
+ // unterminated UTF sequence
+ *dst++ = QChar::ReplacementCharacter;
+ while (src++ < end)
+ *dst++ = QChar::ReplacementCharacter;
+ }
+
+ result.truncate(dst - (ushort *)result.unicode());
if (state) {
state->invalidChars += invalid;
- state->remainingChars = need;
if (headerdone)
state->flags |= QTextCodec::IgnoreHeader;
- state->state_data[0] = need ? uc : 0;
- state->state_data[1] = need ? min_uc : 0;
+ if (res == QUtf8BaseTraits::EndOfString) {
+ --src; // unread the byte in ch
+ state->remainingChars = end - src;
+ memcpy(&state->state_data[0], src, end - src);
+ } else {
+ state->remainingChars = 0;
+ }
}
return result;
}
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index 4f0e2394fe..c94a7a12e4 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -169,6 +169,110 @@ namespace QUtf8Functions
Traits::appendByte(dst, 0x80 | (u & 0x3f));
return 0;
}
+
+ inline bool isContinuationByte(uchar b)
+ {
+ return (b & 0xc0) == 0x80;
+ }
+
+ /// returns the number of characters consumed (including \a b) in case of success;
+ /// returns negative in case of error: Traits::Error or Traits::EndOfString
+ template <typename Traits, typename OutputPtr, typename InputPtr> inline
+ int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
+ {
+ int charsNeeded;
+ uint min_uc;
+ uint uc;
+
+ if (!Traits::skipAsciiHandling && b < 0x80) {
+ // US-ASCII
+ Traits::appendUtf16(dst, b);
+ return 1;
+ }
+
+ if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
+ // an UTF-8 first character must be at least 0xC0
+ // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
+ return Traits::Error;
+ } else if (b < 0xe0) {
+ charsNeeded = 2;
+ min_uc = 0x80;
+ uc = b & 0x1f;
+ } else if (b < 0xf0) {
+ charsNeeded = 3;
+ min_uc = 0x800;
+ uc = b & 0x0f;
+ } else if (b < 0xf5) {
+ charsNeeded = 4;
+ min_uc = 0x10000;
+ uc = b & 0x07;
+ } else {
+ // the last Unicode character is U+10FFFF
+ // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
+ // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
+ return Traits::Error;
+ }
+
+ int bytesAvailable = Traits::availableBytes(src, end);
+ if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
+ // it's possible that we have an error instead of just unfinished bytes
+ if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
+ return Traits::Error;
+ if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
+ return Traits::Error;
+ if (bytesAvailable > 2 && !isContinuationByte(Traits::peekByte(src, 2)))
+ return Traits::Error;
+ return Traits::EndOfString;
+ }
+
+ // first continuation character
+ b = Traits::peekByte(src, 0);
+ if (!isContinuationByte(b))
+ return Traits::Error;
+ uc <<= 6;
+ uc |= b & 0x3f;
+
+ if (charsNeeded > 2) {
+ // second continuation character
+ b = Traits::peekByte(src, 1);
+ if (!isContinuationByte(b))
+ return Traits::Error;
+ uc <<= 6;
+ uc |= b & 0x3f;
+
+ if (charsNeeded > 3) {
+ // third continuation character
+ b = Traits::peekByte(src, 2);
+ if (!isContinuationByte(b))
+ return Traits::Error;
+ uc <<= 6;
+ uc |= b & 0x3f;
+ }
+ }
+
+ // we've decoded something; safety-check it
+ if (!Traits::isTrusted) {
+ if (uc < min_uc)
+ return Traits::Error;
+ if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
+ return Traits::Error;
+ if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
+ return Traits::Error;
+ }
+
+ // write the UTF-16 sequence
+ if (!QChar::requiresSurrogates(uc)) {
+ // UTF-8 decoded and no surrogates are required
+ // detach if necessary
+ Traits::appendUtf16(dst, ushort(uc));
+ } else {
+ // UTF-8 decoded to something that requires a surrogate pair
+ Traits::appendUcs4(dst, uc);
+ }
+
+ Traits::advanceByte(src, charsNeeded - 1);
+ return charsNeeded;
+ }
}
enum DataEndianness
@@ -180,6 +284,7 @@ enum DataEndianness
struct QUtf8
{
+ static QString convertToUnicode(const char *, int);
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
static QByteArray convertFromUnicode(const QChar *, int);
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);