summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs/qutfcodec.cpp
diff options
context:
space:
mode:
authorQt by Nokia <qt-info@nokia.com>2011-04-27 12:05:43 +0200
committeraxis <qt-info@nokia.com>2011-04-27 12:05:43 +0200
commit38be0d13830efd2d98281c645c3a60afe05ffece (patch)
tree6ea73f3ec77f7d153333779883e8120f82820abe /src/corelib/codecs/qutfcodec.cpp
Initial import from the monolithic Qt.
This is the beginning of revision history for this module. If you want to look at revision history older than this, please refer to the Qt Git wiki for how to use Git history grafting. At the time of writing, this wiki is located here: http://qt.gitorious.org/qt/pages/GitIntroductionWithQt If you have already performed the grafting and you don't see any history beyond this commit, try running "git log" with the "--follow" argument. Branched from the monolithic repo, Qt master branch, at commit 896db169ea224deb96c59ce8af800d019de63f12
Diffstat (limited to 'src/corelib/codecs/qutfcodec.cpp')
-rw-r--r--src/corelib/codecs/qutfcodec.cpp670
1 files changed, 670 insertions, 0 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
new file mode 100644
index 0000000000..c44832936d
--- /dev/null
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -0,0 +1,670 @@
+/****************************************************************************
+**
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the QtCore module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights. These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qutfcodec_p.h"
+#include "qlist.h"
+#include "qendian.h"
+#include "qchar.h"
+
+QT_BEGIN_NAMESPACE
+
+enum { Endian = 0, Data = 1 };
+
+static inline bool isUnicodeNonCharacter(uint ucs4)
+{
+ // Unicode has a couple of "non-characters" that one can use internally,
+ // but are not allowed to be used for text interchange.
+ //
+ // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
+ // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
+ // U+FDEF (inclusive)
+
+ return (ucs4 & 0xfffe) == 0xfffe
+ || (ucs4 - 0xfdd0U) < 16;
+}
+
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
+{
+ uchar replacement = '?';
+ int rlen = 3*len;
+ int surrogate_high = -1;
+ if (state) {
+ if (state->flags & QTextCodec::ConvertInvalidToNull)
+ replacement = 0;
+ if (!(state->flags & QTextCodec::IgnoreHeader))
+ rlen += 3;
+ if (state->remainingChars)
+ surrogate_high = state->state_data[0];
+ }
+
+ QByteArray rstr;
+ rstr.resize(rlen);
+ uchar* cursor = (uchar*)rstr.data();
+ const QChar *ch = uc;
+ int invalid = 0;
+ if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
+ *cursor++ = 0xef;
+ *cursor++ = 0xbb;
+ *cursor++ = 0xbf;
+ }
+
+ const QChar *end = ch + len;
+ while (ch < end) {
+ uint u = ch->unicode();
+ if (surrogate_high >= 0) {
+ if (u >= 0xdc00 && u < 0xe000) {
+ u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
+ surrogate_high = -1;
+ } else {
+ // high surrogate without low
+ *cursor = replacement;
+ ++ch;
+ ++invalid;
+ surrogate_high = -1;
+ continue;
+ }
+ } else if (u >= 0xdc00 && u < 0xe000) {
+ // low surrogate without high
+ *cursor = replacement;
+ ++ch;
+ ++invalid;
+ continue;
+ } else if (u >= 0xd800 && u < 0xdc00) {
+ surrogate_high = u;
+ ++ch;
+ continue;
+ }
+
+ if (u < 0x80) {
+ *cursor++ = (uchar)u;
+ } else {
+ if (u < 0x0800) {
+ *cursor++ = 0xc0 | ((uchar) (u >> 6));
+ } else {
+ // is it one of the Unicode non-characters?
+ if (isUnicodeNonCharacter(u)) {
+ *cursor++ = replacement;
+ ++ch;
+ ++invalid;
+ continue;
+ }
+
+ if (u > 0xffff) {
+ *cursor++ = 0xf0 | ((uchar) (u >> 18));
+ *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
+ } else {
+ *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
+ }
+ *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
+ }
+ *cursor++ = 0x80 | ((uchar) (u&0x3f));
+ }
+ ++ch;
+ }
+
+ rstr.resize(cursor - (const uchar*)rstr.constData());
+ if (state) {
+ state->invalidChars += invalid;
+ state->flags |= QTextCodec::IgnoreHeader;
+ state->remainingChars = 0;
+ if (surrogate_high >= 0) {
+ state->remainingChars = 1;
+ state->state_data[0] = surrogate_high;
+ }
+ }
+ return rstr;
+}
+
+QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
+{
+ bool headerdone = false;
+ ushort replacement = QChar::ReplacementCharacter;
+ int need = 0;
+ int error = -1;
+ uint uc = 0;
+ uint min_uc = 0;
+ if (state) {
+ if (state->flags & QTextCodec::IgnoreHeader)
+ headerdone = true;
+ if (state->flags & QTextCodec::ConvertInvalidToNull)
+ replacement = QChar::Null;
+ need = state->remainingChars;
+ if (need) {
+ uc = state->state_data[0];
+ min_uc = state->state_data[1];
+ }
+ }
+ if (!headerdone && len > 3
+ && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+ // starts with a byte order mark
+ chars += 3;
+ len -= 3;
+ headerdone = true;
+ }
+
+ QString result(need + len + 1, Qt::Uninitialized); // worst case
+ ushort *qch = (ushort *)result.unicode();
+ uchar ch;
+ int invalid = 0;
+
+ for (int i = 0; i < len; ++i) {
+ ch = chars[i];
+ if (need) {
+ if ((ch&0xc0) == 0x80) {
+ uc = (uc << 6) | (ch & 0x3f);
+ --need;
+ if (!need) {
+ // utf-8 bom composes into 0xfeff code point
+ bool nonCharacter;
+ if (!headerdone && uc == 0xfeff) {
+ // don't do anything, just skip the BOM
+ } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
+ // surrogate pair
+ Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
+ *qch++ = QChar::highSurrogate(uc);
+ *qch++ = QChar::lowSurrogate(uc);
+ } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
+ // error: overlong sequence, UTF16 surrogate or non-character
+ *qch++ = replacement;
+ ++invalid;
+ } else {
+ *qch++ = uc;
+ }
+ headerdone = true;
+ }
+ } else {
+ // error
+ i = error;
+ *qch++ = replacement;
+ ++invalid;
+ need = 0;
+ headerdone = true;
+ }
+ } else {
+ if (ch < 128) {
+ *qch++ = ushort(ch);
+ headerdone = true;
+ } else if ((ch & 0xe0) == 0xc0) {
+ uc = ch & 0x1f;
+ need = 1;
+ error = i;
+ min_uc = 0x80;
+ headerdone = true;
+ } else if ((ch & 0xf0) == 0xe0) {
+ uc = ch & 0x0f;
+ need = 2;
+ error = i;
+ min_uc = 0x800;
+ } else if ((ch&0xf8) == 0xf0) {
+ uc = ch & 0x07;
+ need = 3;
+ error = i;
+ min_uc = 0x10000;
+ headerdone = true;
+ } else {
+ // error
+ *qch++ = replacement;
+ ++invalid;
+ headerdone = true;
+ }
+ }
+ }
+ if (!state && need > 0) {
+ // unterminated UTF sequence
+ for (int i = error; i < len; ++i) {
+ *qch++ = replacement;
+ ++invalid;
+ }
+ }
+ result.truncate(qch - (ushort *)result.unicode());
+ if (state) {
+ state->invalidChars += invalid;
+ state->remainingChars = need;
+ if (headerdone)
+ state->flags |= QTextCodec::IgnoreHeader;
+ state->state_data[0] = need ? uc : 0;
+ state->state_data[1] = need ? min_uc : 0;
+ }
+ return result;
+}
+
+QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+ DataEndianness endian = e;
+ int length = 2*len;
+ if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
+ length += 2;
+ }
+ if (e == DetectEndianness) {
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+ }
+
+ QByteArray d;
+ d.resize(length);
+ char *data = d.data();
+ if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
+ QChar bom(QChar::ByteOrderMark);
+ if (endian == BigEndianness) {
+ data[0] = bom.row();
+ data[1] = bom.cell();
+ } else {
+ data[0] = bom.cell();
+ data[1] = bom.row();
+ }
+ data += 2;
+ }
+ if (endian == BigEndianness) {
+ for (int i = 0; i < len; ++i) {
+ *(data++) = uc[i].row();
+ *(data++) = uc[i].cell();
+ }
+ } else {
+ for (int i = 0; i < len; ++i) {
+ *(data++) = uc[i].cell();
+ *(data++) = uc[i].row();
+ }
+ }
+
+ if (state) {
+ state->remainingChars = 0;
+ state->flags |= QTextCodec::IgnoreHeader;
+ }
+ return d;
+}
+
+QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+ DataEndianness endian = e;
+ bool half = false;
+ uchar buf = 0;
+ bool headerdone = false;
+ if (state) {
+ headerdone = state->flags & QTextCodec::IgnoreHeader;
+ if (endian == DetectEndianness)
+ endian = (DataEndianness)state->state_data[Endian];
+ if (state->remainingChars) {
+ half = true;
+ buf = state->state_data[Data];
+ }
+ }
+ if (headerdone && endian == DetectEndianness)
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+ QString result(len, Qt::Uninitialized); // worst case
+ QChar *qch = (QChar *)result.unicode();
+ while (len--) {
+ if (half) {
+ QChar ch;
+ if (endian == LittleEndianness) {
+ ch.setRow(*chars++);
+ ch.setCell(buf);
+ } else {
+ ch.setRow(buf);
+ ch.setCell(*chars++);
+ }
+ if (!headerdone) {
+ headerdone = true;
+ if (endian == DetectEndianness) {
+ if (ch == QChar::ByteOrderSwapped) {
+ endian = LittleEndianness;
+ } else if (ch == QChar::ByteOrderMark) {
+ endian = BigEndianness;
+ } else {
+ if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+ endian = BigEndianness;
+ } else {
+ endian = LittleEndianness;
+ ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
+ }
+ *qch++ = ch;
+ }
+ } else if (ch != QChar::ByteOrderMark) {
+ *qch++ = ch;
+ }
+ } else {
+ *qch++ = ch;
+ }
+ half = false;
+ } else {
+ buf = *chars++;
+ half = true;
+ }
+ }
+ result.truncate(qch - result.unicode());
+
+ if (state) {
+ if (headerdone)
+ state->flags |= QTextCodec::IgnoreHeader;
+ state->state_data[Endian] = endian;
+ if (half) {
+ state->remainingChars = 1;
+ state->state_data[Data] = buf;
+ } else {
+ state->remainingChars = 0;
+ state->state_data[Data] = 0;
+ }
+ }
+ return result;
+}
+
+QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+ DataEndianness endian = e;
+ int length = 4*len;
+ if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
+ length += 4;
+ }
+ if (e == DetectEndianness) {
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+ }
+
+ QByteArray d(length, Qt::Uninitialized);
+ char *data = d.data();
+ if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
+ if (endian == BigEndianness) {
+ data[0] = 0;
+ data[1] = 0;
+ data[2] = (char)0xfe;
+ data[3] = (char)0xff;
+ } else {
+ data[0] = (char)0xff;
+ data[1] = (char)0xfe;
+ data[2] = 0;
+ data[3] = 0;
+ }
+ data += 4;
+ }
+ if (endian == BigEndianness) {
+ for (int i = 0; i < len; ++i) {
+ uint cp = uc[i].unicode();
+ if (uc[i].isHighSurrogate() && i < len - 1)
+ cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
+ *(data++) = cp >> 24;
+ *(data++) = (cp >> 16) & 0xff;
+ *(data++) = (cp >> 8) & 0xff;
+ *(data++) = cp & 0xff;
+ }
+ } else {
+ for (int i = 0; i < len; ++i) {
+ uint cp = uc[i].unicode();
+ if (uc[i].isHighSurrogate() && i < len - 1)
+ cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
+ *(data++) = cp & 0xff;
+ *(data++) = (cp >> 8) & 0xff;
+ *(data++) = (cp >> 16) & 0xff;
+ *(data++) = cp >> 24;
+ }
+ }
+
+ if (state) {
+ state->remainingChars = 0;
+ state->flags |= QTextCodec::IgnoreHeader;
+ }
+ return d;
+}
+
+QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+ DataEndianness endian = e;
+ uchar tuple[4];
+ int num = 0;
+ bool headerdone = false;
+ if (state) {
+ headerdone = state->flags & QTextCodec::IgnoreHeader;
+ if (endian == DetectEndianness) {
+ endian = (DataEndianness)state->state_data[Endian];
+ }
+ num = state->remainingChars;
+ memcpy(tuple, &state->state_data[Data], 4);
+ }
+ if (headerdone && endian == DetectEndianness)
+ endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+ QString result;
+ result.resize((num + len) >> 2 << 1); // worst case
+ QChar *qch = (QChar *)result.unicode();
+
+ const char *end = chars + len;
+ while (chars < end) {
+ tuple[num++] = *chars++;
+ if (num == 4) {
+ if (!headerdone) {
+ if (endian == DetectEndianness) {
+ if (endian == DetectEndianness) {
+ if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
+ endian = LittleEndianness;
+ num = 0;
+ continue;
+ } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
+ endian = BigEndianness;
+ num = 0;
+ continue;
+ } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+ endian = BigEndianness;
+ } else {
+ endian = LittleEndianness;
+ }
+ }
+ } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
+ num = 0;
+ continue;
+ }
+ }
+ uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
+ if (code >= 0x10000) {
+ *qch++ = QChar::highSurrogate(code);
+ *qch++ = QChar::lowSurrogate(code);
+ } else {
+ *qch++ = code;
+ }
+ num = 0;
+ }
+ }
+ result.truncate(qch - result.unicode());
+
+ if (state) {
+ if (headerdone)
+ state->flags |= QTextCodec::IgnoreHeader;
+ state->state_data[Endian] = endian;
+ state->remainingChars = num;
+ memcpy(&state->state_data[Data], tuple, 4);
+ }
+ return result;
+}
+
+
+#ifndef QT_NO_TEXTCODEC
+
+QUtf8Codec::~QUtf8Codec()
+{
+}
+
+QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
+{
+ return QUtf8::convertFromUnicode(uc, len, state);
+}
+
+void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
+{
+ *target += QUtf8::convertToUnicode(chars, len, state);
+}
+
+QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
+{
+ return QUtf8::convertToUnicode(chars, len, state);
+}
+
+QByteArray QUtf8Codec::name() const
+{
+ return "UTF-8";
+}
+
+int QUtf8Codec::mibEnum() const
+{
+ return 106;
+}
+
+QUtf16Codec::~QUtf16Codec()
+{
+}
+
+QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
+{
+ return QUtf16::convertFromUnicode(uc, len, state, e);
+}
+
+QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
+{
+ return QUtf16::convertToUnicode(chars, len, state, e);
+}
+
+int QUtf16Codec::mibEnum() const
+{
+ return 1015;
+}
+
+QByteArray QUtf16Codec::name() const
+{
+ return "UTF-16";
+}
+
+QList<QByteArray> QUtf16Codec::aliases() const
+{
+ return QList<QByteArray>();
+}
+
+int QUtf16BECodec::mibEnum() const
+{
+ return 1013;
+}
+
+QByteArray QUtf16BECodec::name() const
+{
+ return "UTF-16BE";
+}
+
+QList<QByteArray> QUtf16BECodec::aliases() const
+{
+ QList<QByteArray> list;
+ return list;
+}
+
+int QUtf16LECodec::mibEnum() const
+{
+ return 1014;
+}
+
+QByteArray QUtf16LECodec::name() const
+{
+ return "UTF-16LE";
+}
+
+QList<QByteArray> QUtf16LECodec::aliases() const
+{
+ QList<QByteArray> list;
+ return list;
+}
+
+QUtf32Codec::~QUtf32Codec()
+{
+}
+
+QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
+{
+ return QUtf32::convertFromUnicode(uc, len, state, e);
+}
+
+QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
+{
+ return QUtf32::convertToUnicode(chars, len, state, e);
+}
+
+int QUtf32Codec::mibEnum() const
+{
+ return 1017;
+}
+
+QByteArray QUtf32Codec::name() const
+{
+ return "UTF-32";
+}
+
+QList<QByteArray> QUtf32Codec::aliases() const
+{
+ QList<QByteArray> list;
+ return list;
+}
+
+int QUtf32BECodec::mibEnum() const
+{
+ return 1018;
+}
+
+QByteArray QUtf32BECodec::name() const
+{
+ return "UTF-32BE";
+}
+
+QList<QByteArray> QUtf32BECodec::aliases() const
+{
+ QList<QByteArray> list;
+ return list;
+}
+
+int QUtf32LECodec::mibEnum() const
+{
+ return 1019;
+}
+
+QByteArray QUtf32LECodec::name() const
+{
+ return "UTF-32LE";
+}
+
+QList<QByteArray> QUtf32LECodec::aliases() const
+{
+ QList<QByteArray> list;
+ return list;
+}
+
+#endif //QT_NO_TEXTCODEC
+
+QT_END_NAMESPACE