diff options
author | Lars Knoll <lars.knoll@nokia.com> | 2012-07-21 00:31:37 +0200 |
---|---|---|
committer | Qt by Nokia <qt-info@nokia.com> | 2012-07-31 11:12:28 +0200 |
commit | 88d2e92b39ffd4a6ea9446498ad5a1cb208022a6 (patch) | |
tree | d64944f988a79bc8cbc556f6191e15bbe59b9cfc /src/corelib/codecs/qicucodec.cpp | |
parent | 865a9465f36edaf773b7836ee005ca96502dfca9 (diff) |
ICU code page conversion support
Use ICU to do code page conversion instead of the
built-in text codecs. With this change, QTextCodec simply
becomes a wrapper around ICU's ucnv_* functions.
We only keep our own codecs for UTF-*, ISO-8859-1 and
ISO-8859-15 for performance reasons, and for TSCII
and iscii-* because they aren't supported by ICU.
Change-Id: I4fc49eba55cf772b9772c6dac606a47a44346a60
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/codecs/qicucodec.cpp')
-rw-r--r-- | src/corelib/codecs/qicucodec.cpp | 625 |
1 files changed, 625 insertions, 0 deletions
diff --git a/src/corelib/codecs/qicucodec.cpp b/src/corelib/codecs/qicucodec.cpp new file mode 100644 index 0000000000..c91115b3b5 --- /dev/null +++ b/src/corelib/codecs/qicucodec.cpp @@ -0,0 +1,625 @@ +/**************************************************************************** +** +** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies). +** Contact: http://www.qt-project.org/ +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** GNU Lesser General Public License Usage +** This file may be used under the terms of the GNU Lesser General Public +** License version 2.1 as published by the Free Software Foundation and +** appearing in the file LICENSE.LGPL included in the packaging of this +** file. Please review the following information to ensure the GNU Lesser +** General Public License version 2.1 requirements will be met: +** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU General +** Public License version 3.0 as published by the Free Software Foundation +** and appearing in the file LICENSE.GPL included in the packaging of this +** file. Please review the following information to ensure the GNU General +** Public License version 3.0 requirements will be met: +** http://www.gnu.org/copyleft/gpl.html. +** +** Other Usage +** Alternatively, this file may be used in accordance with the terms and +** conditions contained in a signed written agreement between you and Nokia. 
+** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qicucodec_p.h" +#include "qtextcodec_p.h" +#include "private/qcoreglobaldata_p.h" +#include "qdebug.h" + +#include "unicode/ucnv.h" + +QT_BEGIN_NAMESPACE + +extern QMutex *qTextCodecsMutex(); + +static void qIcuCodecStateFree(QTextCodec::ConverterState *state) +{ + ucnv_close(static_cast<UConverter *>(state->d)); +} + +/* The list below is generated from http://www.iana.org/assignments/character-sets/ + using the snippet of code below: + +#include <QtCore> +#include <unicode/ucnv.h> + +int main(int argc, char **argv) +{ + QCoreApplication app(argc, argv); + + QFile file("character-sets.txt"); + file.open(QFile::ReadOnly); + QByteArray name; + int mib = -1; + QByteArray nameList; + int pos = 0; + while (!file.atEnd()) { + QByteArray s = file.readLine().trimmed(); + if (s.isEmpty()) { + if (mib != -1) { + UErrorCode error = U_ZERO_ERROR; + const char *standard_name = ucnv_getStandardName(name, "MIME", &error); + if (U_FAILURE(error) || !standard_name) { + error = U_ZERO_ERROR; + standard_name = ucnv_getStandardName(name, "IANA", &error); + } + UConverter *conv = ucnv_open(standard_name, &error); + if (!U_FAILURE(error) && conv && standard_name) { + ucnv_close(conv); + printf(" { %d, %d },\n", mib, pos); + nameList += "\""; + nameList += standard_name; + nameList += "\\0\"\n"; + pos += strlen(standard_name) + 1; + } + } + name = QByteArray(); + mib = -1; + } + if (s.startsWith("Name: ")) { + name = s.mid(5).trimmed(); + if (name.indexOf(' ') > 0) + name = name.left(name.indexOf(' ')); + } + if (s.startsWith("MIBenum:")) + mib = s.mid(8).trimmed().toInt(); + if (s.startsWith("Alias:") && s.contains("MIME")) { + name = s.mid(6).trimmed(); + name = name.left(name.indexOf(' ')).trimmed(); + } + } + qDebug() << nameList; +} +*/ + +struct MibToName { + short mib; + short index; +}; + +static MibToName mibToName[] = { + { 3, 0 }, + { 4, 
9 }, + { 5, 20 }, + { 6, 31 }, + { 7, 42 }, + { 8, 53 }, + { 9, 64 }, + { 10, 75 }, + { 11, 86 }, + { 12, 97 }, + { 13, 108 }, + { 16, 120 }, + { 17, 134 }, + { 18, 144 }, + { 30, 151 }, + { 36, 160 }, + { 37, 167 }, + { 38, 179 }, + { 39, 186 }, + { 40, 198 }, + { 57, 212 }, + { 81, 223 }, + { 82, 234 }, + { 84, 245 }, + { 85, 256 }, + { 104, 267 }, + { 105, 279 }, + { 106, 295 }, + { 109, 301 }, + { 110, 313 }, + { 111, 325 }, + { 113, 337 }, + { 114, 341 }, + { 1000, 349 }, + { 1001, 356 }, + { 1011, 363 }, + { 1012, 368 }, + { 1013, 374 }, + { 1014, 383 }, + { 1015, 392 }, + { 1016, 399 }, + { 1017, 406 }, + { 1018, 413 }, + { 1019, 422 }, + { 1020, 431 }, + { 2004, 438 }, + { 2005, 448 }, + { 2009, 472 }, + { 2013, 479 }, + { 2016, 486 }, + { 2024, 495 }, + { 2025, 505 }, + { 2026, 512 }, + { 2027, 517 }, + { 2028, 527 }, + { 2030, 534 }, + { 2033, 541 }, + { 2034, 548 }, + { 2035, 555 }, + { 2037, 562 }, + { 2038, 569 }, + { 2039, 576 }, + { 2040, 583 }, + { 2041, 590 }, + { 2043, 597 }, + { 2011, 604 }, + { 2044, 611 }, + { 2045, 618 }, + { 2010, 624 }, + { 2046, 631 }, + { 2047, 638 }, + { 2048, 645 }, + { 2049, 652 }, + { 2050, 659 }, + { 2051, 666 }, + { 2052, 673 }, + { 2053, 680 }, + { 2054, 687 }, + { 2055, 694 }, + { 2056, 701 }, + { 2062, 708 }, + { 2063, 715 }, + { 2084, 723 }, + { 2085, 730 }, + { 2086, 741 }, + { 2087, 748 }, + { 2088, 755 }, + { 2089, 762 }, + { 2091, 771 }, + { 2092, 780 }, + { 2093, 789 }, + { 2094, 798 }, + { 2095, 807 }, + { 2096, 816 }, + { 2097, 825 }, + { 2098, 834 }, + { 2099, 843 }, + { 2100, 852 }, + { 2101, 861 }, + { 2102, 872 }, + { 2250, 880 }, + { 2251, 893 }, + { 2252, 906 }, + { 2253, 919 }, + { 2254, 932 }, + { 2255, 945 }, + { 2256, 958 }, + { 2257, 971 }, + { 2258, 984 }, + { 2259, 997 }, +}; +int mibToNameSize = sizeof(mibToName)/sizeof(MibToName); + +static const char mibToNameTable[] = + "US-ASCII\0" + "ISO-8859-1\0" + "ISO-8859-2\0" + "ISO-8859-3\0" + "ISO-8859-4\0" + "ISO-8859-5\0" + "ISO-8859-6\0" + 
"ISO-8859-7\0" + "ISO-8859-8\0" + "ISO-8859-9\0" + "ISO-8859-10\0" + "ISO-2022-JP-1\0" + "Shift_JIS\0" + "EUC-JP\0" + "US-ASCII\0" + "EUC-KR\0" + "ISO-2022-KR\0" + "EUC-KR\0" + "ISO-2022-JP\0" + "ISO-2022-JP-2\0" + "GB_2312-80\0" + "ISO-8859-6\0" + "ISO-8859-6\0" + "ISO-8859-8\0" + "ISO-8859-8\0" + "ISO-2022-CN\0" + "ISO-2022-CN-EXT\0" + "UTF-8\0" + "ISO-8859-13\0" + "ISO-8859-14\0" + "ISO-8859-15\0" + "GBK\0" + "GB18030\0" + "UTF-16\0" + "UTF-32\0" + "SCSU\0" + "UTF-7\0" + "UTF-16BE\0" + "UTF-16LE\0" + "UTF-16\0" + "CESU-8\0" + "UTF-32\0" + "UTF-32BE\0" + "UTF-32LE\0" + "BOCU-1\0" + "hp-roman8\0" + "Adobe-Standard-Encoding\0" + "IBM850\0" + "IBM862\0" + "IBM-Thai\0" + "Shift_JIS\0" + "GB2312\0" + "Big5\0" + "macintosh\0" + "IBM037\0" + "IBM273\0" + "IBM277\0" + "IBM278\0" + "IBM280\0" + "IBM284\0" + "IBM285\0" + "IBM290\0" + "IBM297\0" + "IBM420\0" + "IBM424\0" + "IBM437\0" + "IBM500\0" + "cp851\0" + "IBM852\0" + "IBM855\0" + "IBM857\0" + "IBM860\0" + "IBM861\0" + "IBM863\0" + "IBM864\0" + "IBM865\0" + "IBM868\0" + "IBM869\0" + "IBM870\0" + "IBM871\0" + "IBM918\0" + "IBM1026\0" + "KOI8-R\0" + "HZ-GB-2312\0" + "IBM866\0" + "IBM775\0" + "KOI8-U\0" + "IBM00858\0" + "IBM01140\0" + "IBM01141\0" + "IBM01142\0" + "IBM01143\0" + "IBM01144\0" + "IBM01145\0" + "IBM01146\0" + "IBM01147\0" + "IBM01148\0" + "IBM01149\0" + "Big5-HKSCS\0" + "IBM1047\0" + "windows-1250\0" + "windows-1251\0" + "windows-1252\0" + "windows-1253\0" + "windows-1254\0" + "windows-1255\0" + "windows-1256\0" + "windows-1257\0" + "windows-1258\0" + "TIS-620\0"; + +QList<QByteArray> QIcuCodec::availableCodecs() +{ + QList<QByteArray> codecs; + int n = ucnv_countAvailable(); + for (int i = 0; i < n; ++i) { + const char *name = ucnv_getAvailableName(i); + + UErrorCode error = U_ZERO_ERROR; + const char *standardName = ucnv_getStandardName(name, "MIME", &error); + if (U_FAILURE(error) || !standardName) { + error = U_ZERO_ERROR; + standardName = ucnv_getStandardName(name, "IANA", &error); + } + if 
(U_FAILURE(error)) + continue; + + error = U_ZERO_ERROR; + int ac = ucnv_countAliases(standardName, &error); + if (U_FAILURE(error)) + continue; + for (int j = 0; j < ac; ++j) { + error = U_ZERO_ERROR; + const char *alias = ucnv_getAlias(standardName, j, &error); + if (!U_SUCCESS(error)) + continue; + codecs += alias; + } + } + + // handled by Qt and not in ICU: + codecs += "TSCII"; + codecs += "System"; + + return codecs; +} + +QList<int> QIcuCodec::availableMibs() +{ + QList<int> mibs; + for (int i = 0; i < mibToNameSize; ++i) + mibs += mibToName[i].mib; + + // handled by Qt and not in ICU: + mibs += 2107; // TSCII + + return mibs; +} + +static inline bool nameMatch(const QByteArray &a, const char *b) +{ return a == b; } + +QTextCodec *QIcuCodec::codecForName(const char *name) +{ + // backwards compatibility with Qt 4.x + if (!qstrcmp(name, "CP949")) + name = "windows-949"; + // this one is broken data in ICU 4.4, and can't be resolved even though it's an alias to tis-620 + if (!qstrcmp(name, "windows-874-2000")) + name = "TIS-620"; + + UErrorCode error = U_ZERO_ERROR; + // MIME gives better default names + const char *standardName = ucnv_getStandardName(name, "MIME", &error); + if (U_FAILURE(error) || !standardName) { + error = U_ZERO_ERROR; + standardName = ucnv_getStandardName(name, "IANA", &error); + } + bool qt_only = false; + if (U_FAILURE(error) || !standardName) { + standardName = name; + qt_only = true; + } else { + // correct some issues where the ICU data set contains duplicated entries. + // Where this happens it's because one data set is a subset of another. We + // always use the larger data set. 
+ + if (qstrcmp(standardName, "GB2312") == 0 || qstrcmp(standardName, "GB_2312-80") == 0) + standardName = "GBK"; + else if (qstrcmp(standardName, "KSC_5601") == 0 || qstrcmp(standardName, "EUC-KR") == 0 || qstrcmp(standardName, "cp1363") == 0) + standardName = "windows-949"; + } + + QCoreGlobalData *globalData = QCoreGlobalData::instance(); + QTextCodecCache *cache = &globalData->codecCache; + + QTextCodec *codec; + if (cache) { + codec = cache->value(standardName); + if (codec) + return codec; + } + + for (int i = 0; i < globalData->allCodecs.size(); ++i) { + QTextCodec *cursor = globalData->allCodecs.at(i); + if (nameMatch(cursor->name(), standardName)) { + if (cache) + cache->insert(standardName, cursor); + return cursor; + } + QList<QByteArray> aliases = cursor->aliases(); + for (int y = 0; y < aliases.size(); ++y) + if (nameMatch(aliases.at(y), standardName)) { + if (cache) + cache->insert(standardName, cursor); + return cursor; + } + } + + if (qt_only) + return 0; + + // check whether there is really a converter for the name available. 
+ UConverter *conv = ucnv_open(standardName, &error); + if (!conv) { + qDebug() << "codecForName: ucnv_open failed" << standardName << u_errorName(error); + return 0; + } + //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName; + ucnv_close(conv); + + + QTextCodec *c = new QIcuCodec(standardName); + if (cache) + cache->insert(standardName, c); + return c; +} + + +QTextCodec *QIcuCodec::codecForMib(int mib) +{ + for (int i = 0; i < mibToNameSize; ++i) { + if (mibToName[i].mib == mib) + return codecForName(mibToNameTable + mibToName[i].index); + } + return 0; +} + + +QIcuCodec::QIcuCodec(const char *name) + : m_name(name) +{ +} + +QIcuCodec::~QIcuCodec() +{ +} + +UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const +{ + UConverter *conv = 0; + if (state) { + if (!state->d) { + // first time + state->flags |= QTextCodec::FreeFunction; + QTextCodecUnalignedPointer::encode(state->state_data, qIcuCodecStateFree); + UErrorCode error = U_ZERO_ERROR; + state->d = ucnv_open(m_name, &error); + ucnv_setSubstChars(static_cast<UConverter *>(state->d), + state->flags & QTextCodec::ConvertInvalidToNull ? 
"\0" : "?", 1, &error); + if (U_FAILURE(error)) + qDebug() << "getConverter(state) ucnv_open failed" << m_name << u_errorName(error); + } + conv = static_cast<UConverter *>(state->d); + } + if (!conv) { + // stateless conversion + UErrorCode error = U_ZERO_ERROR; + conv = ucnv_open(m_name, &error); + ucnv_setSubstChars(conv, "?", 1, &error); + if (U_FAILURE(error)) + qDebug() << "getConverter(no state) ucnv_open failed" << m_name << u_errorName(error); + } + return conv; +} + +QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const +{ + UConverter *conv = getConverter(state); + + QString string(length + 2, Qt::Uninitialized); + + const char *end = chars + length; + int convertedChars = 0; + while (1) { + UChar *uc = (UChar *)string.data(); + UChar *ucEnd = uc + string.length(); + uc += convertedChars; + UErrorCode error = U_ZERO_ERROR; + ucnv_toUnicode(conv, + &uc, ucEnd, + &chars, end, + 0, false, &error); + if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) { + qDebug() << "convertToUnicode failed:" << u_errorName(error); + break; + } + + convertedChars = uc - (UChar *)string.data(); + if (chars >= end) + break; + string.resize(string.length()*2); + } + string.resize(convertedChars); + + if (!state) + ucnv_close(conv); + return string; +} + + +QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const +{ + UConverter *conv = getConverter(state); + + int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv)); + QByteArray string(requiredLength, Qt::Uninitialized); + + const UChar *uc = (const UChar *)unicode; + const UChar *end = uc + length; + int convertedChars = 0; + while (1) { + char *ch = (char *)string.data(); + char *chEnd = ch + string.length(); + ch += convertedChars; + UErrorCode error = U_ZERO_ERROR; + ucnv_fromUnicode(conv, + &ch, chEnd, + &uc, end, + 0, false, &error); + if (!U_SUCCESS(error)) + qDebug() << 
"convertFromUnicode failed:" << u_errorName(error); + convertedChars = ch - string.data(); + if (uc >= end) + break; + string.resize(string.length()*2); + } + string.resize(convertedChars); + + if (!state) + ucnv_close(conv); + + return string; +} + + +QByteArray QIcuCodec::name() const +{ + return m_name; +} + + +QList<QByteArray> QIcuCodec::aliases() const +{ + UErrorCode error = U_ZERO_ERROR; + + int n = ucnv_countAliases(m_name, &error); + + QList<QByteArray> aliases; + for (int i = 0; i < n; ++i) { + const char *a = ucnv_getAlias(m_name, i, &error); + // skip the canonical name + if (!a || !qstrcmp(a, m_name)) + continue; + aliases += a; + } + + return aliases; +} + + +int QIcuCodec::mibEnum() const +{ + for (int i = 0; i < mibToNameSize; ++i) { + if (m_name == (mibToNameTable + mibToName[i].index)) + return mibToName[i].mib; + } + + return 0; +} + +QT_END_NAMESPACE + |