summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/codecs')
-rw-r--r--src/corelib/codecs/qutfcodec.cpp86
-rw-r--r--src/corelib/codecs/qutfcodec_p.h114
2 files changed, 155 insertions, 45 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index e425f8634c..d1fc5b851a 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtCore module of the Qt Toolkit.
@@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE
enum { Endian = 0, Data = 1 };
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
+{
+ // create a QByteArray with the worst case scenario size
+ QByteArray result(len * 3, Qt::Uninitialized);
+ uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
+ const ushort *src = reinterpret_cast<const ushort *>(uc);
+ const ushort *const end = src + len;
+
+ while (src != end) {
+ ushort uc = *src++;
+ int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
+ if (res < 0) {
+ // encoding error - append '?'
+ *dst++ = '?';
+ }
+ }
+
+ result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
+ return result;
+}
+
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
{
uchar replacement = '?';
@@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
surrogate_high = state->state_data[0];
}
- QByteArray rstr;
- rstr.resize(rlen);
- uchar* cursor = (uchar*)rstr.data();
- const QChar *ch = uc;
+
+ QByteArray rstr(rlen, Qt::Uninitialized);
+ uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
+ const ushort *src = reinterpret_cast<const ushort *>(uc);
+ const ushort *const end = src + len;
+
int invalid = 0;
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
+ // append UTF-8 BOM
*cursor++ = 0xef;
*cursor++ = 0xbb;
*cursor++ = 0xbf;
}
- const QChar *end = ch + len;
- while (ch < end) {
- uint u = ch->unicode();
- if (surrogate_high >= 0) {
- if (ch->isLowSurrogate()) {
- u = QChar::surrogateToUcs4(surrogate_high, u);
- surrogate_high = -1;
- } else {
- // high surrogate without low
- *cursor = replacement;
- ++ch;
- ++invalid;
- surrogate_high = -1;
- continue;
- }
- } else if (ch->isLowSurrogate()) {
- // low surrogate without high
- *cursor = replacement;
- ++ch;
- ++invalid;
- continue;
- } else if (ch->isHighSurrogate()) {
- surrogate_high = u;
- ++ch;
+ while (src != end) {
+ ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
+ surrogate_high = -1;
+ int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+ if (Q_LIKELY(res >= 0))
continue;
- }
- if (u < 0x80) {
- *cursor++ = (uchar)u;
- } else {
- if (u < 0x0800) {
- *cursor++ = 0xc0 | ((uchar) (u >> 6));
- } else {
- if (QChar::requiresSurrogates(u)) {
- *cursor++ = 0xf0 | ((uchar) (u >> 18));
- *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
- } else {
- *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
- }
- *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
- }
- *cursor++ = 0x80 | ((uchar) (u&0x3f));
+ if (res == QUtf8BaseTraits::Error) {
+ // encoding error
+ ++invalid;
+ *cursor++ = replacement;
+ } else if (res == QUtf8BaseTraits::EndOfString) {
+ surrogate_high = uc;
+ break;
}
- ++ch;
}
rstr.resize(cursor - (const uchar*)rstr.constData());
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index e1214d50bc..4f0e2394fe 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtCore module of the Qt Toolkit.
@@ -58,6 +59,118 @@
QT_BEGIN_NAMESPACE
+struct QUtf8BaseTraits
+{
+ static const bool isTrusted = false;
+ static const bool allowNonCharacters = true;
+ static const bool skipAsciiHandling = false;
+ static const int Error = -1;
+ static const int EndOfString = -2;
+
+ static bool isValidCharacter(uint u)
+ { return int(u) >= 0; }
+
+ static void appendByte(uchar *&ptr, uchar b)
+ { *ptr++ = b; }
+
+ static uchar peekByte(const uchar *ptr, int n = 0)
+ { return ptr[n]; }
+
+ static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
+ { return end - ptr; }
+
+ static void advanceByte(const uchar *&ptr, int n = 1)
+ { ptr += n; }
+
+ static void appendUtf16(ushort *&ptr, ushort uc)
+ { *ptr++ = uc; }
+
+ static void appendUcs4(ushort *&ptr, uint uc)
+ {
+ appendUtf16(ptr, QChar::highSurrogate(uc));
+ appendUtf16(ptr, QChar::lowSurrogate(uc));
+ }
+
+ static ushort peekUtf16(const ushort *ptr, int n = 0)
+ { return ptr[n]; }
+
+ static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
+ { return end - ptr; }
+
+ static void advanceUtf16(const ushort *&ptr, int n = 1)
+ { ptr += n; }
+
+ // it's possible to output to UCS-4 too
+ static void appendUtf16(uint *&ptr, ushort uc)
+ { *ptr++ = uc; }
+
+ static void appendUcs4(uint *&ptr, uint uc)
+ { *ptr++ = uc; }
+};
+
+namespace QUtf8Functions
+{
+ /// returns 0 on success; errors can only happen if \a u is a surrogate:
+ /// Error if \a u is a low surrogate;
+ /// if \a u is a high surrogate, Error if the next isn't a low one,
+ /// EndOfString if we run into the end of the string.
+ template <typename Traits, typename OutputPtr, typename InputPtr> inline
+ int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
+ {
+ if (!Traits::skipAsciiHandling && u < 0x80) {
+ // U+0000 to U+007F (US-ASCII) - one byte
+ Traits::appendByte(dst, uchar(u));
+ return 0;
+ } else if (u < 0x0800) {
+ // U+0080 to U+07FF - two bytes
+ // first of two bytes
+ Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
+ } else {
+ if (!QChar::isSurrogate(u)) {
+ // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
+ if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
+ return Traits::Error;
+
+ // first of three bytes
+ Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
+ } else {
+ // U+10000 to U+10FFFF - four bytes
+ // need to get one extra codepoint
+ if (Traits::availableUtf16(src, end) == 0)
+ return Traits::EndOfString;
+
+ ushort low = Traits::peekUtf16(src);
+ if (!QChar::isHighSurrogate(u))
+ return Traits::Error;
+ if (!QChar::isLowSurrogate(low))
+ return Traits::Error;
+
+ Traits::advanceUtf16(src);
+ uint ucs4 = QChar::surrogateToUcs4(u, low);
+
+ if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
+ return Traits::Error;
+
+ // first byte
+ Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
+
+ // second of four bytes
+ Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
+
+ // for the rest of the bytes
+ u = ushort(ucs4);
+ }
+
+ // second to last byte
+ Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
+ }
+
+ // last byte
+ Traits::appendByte(dst, 0x80 | (u & 0x3f));
+ return 0;
+ }
+}
+
enum DataEndianness
{
DetectEndianness,
@@ -68,6 +181,7 @@ enum DataEndianness
struct QUtf8
{
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
+ static QByteArray convertFromUnicode(const QChar *, int);
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
};