diff options
author | Karsten Heimrich <karsten.heimrich@qt.io> | 2020-06-17 12:33:40 +0200 |
---|---|---|
committer | Karsten Heimrich <karsten.heimrich@qt.io> | 2020-06-20 02:04:38 +0200 |
commit | 18ec53156ee704fdb4977436fccfdc85333e614b (patch) | |
tree | df734ce7893d570f4a90f923fcfe9c80ddc3de05 /src/corelib/codecs/qtextcodec.cpp | |
parent | 4455de24d453138411c57bae55af18f3fba58d4b (diff) |
Move QTextCodec support out of QtCore
* Assume UTF-8 on all Unix-like systems
* Export some functions to be able to compile QTextCodec once
moved to Qt5Compat.
Task-number: QTBUG-75665
Change-Id: I52ec47a848bc0ba72e9c7689668b1bcc5d736c29
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Diffstat (limited to 'src/corelib/codecs/qtextcodec.cpp')
-rw-r--r-- | src/corelib/codecs/qtextcodec.cpp | 1299 |
1 files changed, 0 insertions, 1299 deletions
diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp deleted file mode 100644 index be585d0407..0000000000 --- a/src/corelib/codecs/qtextcodec.cpp +++ /dev/null @@ -1,1299 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2018 The Qt Company Ltd. -** Copyright (C) 2018 Intel Corporation. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. 
Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -#include "qplatformdefs.h" - -#include "qtextcodec.h" -#include "qtextcodec_p.h" - -#include "qbytearraymatcher.h" -#include "qendian.h" -#include "qfile.h" -#include "qlist.h" -#include <private/qlocking_p.h> -#include "qstringlist.h" -#include "qvarlengtharray.h" -#if !defined(QT_BOOTSTRAPPED) -#include <private/qcoreapplication_p.h> -#endif - -#include "qutfcodec_p.h" -#include "qlatincodec_p.h" - -#if !defined(QT_BOOTSTRAPPED) -#if QT_CONFIG(codecs) -# include "qtsciicodec_p.h" -# include "qisciicodec_p.h" -#endif -#if QT_CONFIG(icu) -#include "qicucodec_p.h" -#else -#if QT_CONFIG(iconv) -# include "qiconvcodec_p.h" -#endif -#ifdef Q_OS_WIN -# include "qwindowscodec_p.h" -#endif -# include "qsimplecodec_p.h" -#if QT_CONFIG(big_codecs) -# ifndef Q_OS_INTEGRITY -# include "qgb18030codec_p.h" -# include "qeucjpcodec_p.h" -# include "qjiscodec_p.h" -# include "qsjiscodec_p.h" -# include "qeuckrcodec_p.h" -# include "qbig5codec_p.h" -# endif // !Q_OS_INTEGRITY -#endif // big_codecs - -#endif // icu -#endif // QT_BOOTSTRAPPED - -#include <mutex> - -#include <stdlib.h> -#include <ctype.h> -#include <locale.h> -#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID) -# include <langinfo.h> -#endif - -QT_BEGIN_NAMESPACE - -typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt; -typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt; - -Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex); - -Q_GLOBAL_STATIC(QTextCodecData, textCodecData) - -QTextCodecData::QTextCodecData() - : codecForLocale(nullptr) -{ -} - -QTextCodecData::~QTextCodecData() -{ - codecForLocale = nullptr; - QList<QTextCodec *> tmp = allCodecs; - 
allCodecs.clear(); - codecCache.clear(); - for (QList<QTextCodec *>::const_iterator it = tmp.constBegin(); it != tmp.constEnd(); ++it) - delete *it; -} - -QTextCodecData *QTextCodecData::instance() -{ - return textCodecData(); -} - -class TextCodecsMutexLocker -{ - using Lock = decltype(qt_unique_lock(std::declval<QRecursiveMutex&>())); - // ### FIXME: this is used when textCodecsMutex already == nullptr - const Lock lock = qt_unique_lock(textCodecsMutex()); -public: - TextCodecsMutexLocker() {} // required d/t an ICC 19 bug -}; - -#if !QT_CONFIG(icu) -static char qtolower(char c) -{ if (c >= 'A' && c <= 'Z') return c + 0x20; return c; } -static bool qisalnum(char c) -{ return (c >= '0' && c <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'z'); } - -bool qTextCodecNameMatch(const char *n, const char *h) -{ - if (qstricmp(n, h) == 0) - return true; - - // if the letters and numbers are the same, we have a match - while (*n != '\0') { - if (qisalnum(*n)) { - for (;;) { - if (*h == '\0') - return false; - if (qisalnum(*h)) - break; - ++h; - } - if (qtolower(*n) != qtolower(*h)) - return false; - ++h; - } - ++n; - } - while (*h && !qisalnum(*h)) - ++h; - return (*h == '\0'); -} - - -#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8) -static QTextCodec *checkForCodec(const QByteArray &name) { - QTextCodec *c = QTextCodec::codecForName(name); - if (!c) { - const int index = name.indexOf('@'); - if (index != -1) { - c = QTextCodec::codecForName(name.left(index)); - } - } - return c; -} -#endif - -static void setup(); - -// \threadsafe -// this returns the codec the method sets up as locale codec to -// avoid a race condition in codecForLocale() when -// setCodecForLocale(0) is called at the same time. 
-static QTextCodec *setupLocaleMapper() -{ - QTextCodecData *globalData = QTextCodecData::instance(); - - QTextCodec *locale = nullptr; - - { - const TextCodecsMutexLocker locker; - if (globalData->allCodecs.isEmpty()) - setup(); - } - -#if !defined(QT_BOOTSTRAPPED) - QCoreApplicationPrivate::initLocale(); -#endif - -#if defined(QT_LOCALE_IS_UTF8) - locale = QTextCodec::codecForName("UTF-8"); -#elif defined(Q_OS_WIN) - locale = QTextCodec::codecForName("System"); -#else - - // First try getting the codecs name from nl_langinfo and see - // if we have a builtin codec for it. - // Only fall back to using iconv if we can't find a builtin codec - // This is because the builtin utf8 codec is around 5 times faster - // then the using QIconvCodec - -#if defined (_XOPEN_UNIX) - char *charset = nl_langinfo(CODESET); - if (charset) - locale = QTextCodec::codecForName(charset); -#endif -#if QT_CONFIG(iconv) - if (!locale) { - // no builtin codec for the locale found, let's try using iconv - (void) new QIconvCodec(); - locale = QTextCodec::codecForName("System"); - } -#endif - - if (!locale) { - // Very poorly defined and followed standards causes lots of - // code to try to get all the cases... This logic is - // duplicated in QIconvCodec, so if you change it here, change - // it there too. - - // Try to determine locale codeset from locale name assigned to - // LC_CTYPE category. - - // First part is getting that locale name. First try setlocale() which - // definitely knows it, but since we cannot fully trust it, get ready - // to fall back to environment variables. - const QByteArray ctype = setlocale(LC_CTYPE, nullptr); - - // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG - // environment variables. - QByteArray lang = qgetenv("LC_ALL"); - if (lang.isEmpty() || lang == "C") { - lang = qgetenv("LC_CTYPE"); - } - if (lang.isEmpty() || lang == "C") { - lang = qgetenv("LANG"); - } - - // Now try these in order: - // 1. 
CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) - // 2. CODESET from lang if it contains a .CODESET part - // 3. ctype (maybe the locale is named "ISO-8859-1" or something) - // 4. locale (ditto) - // 5. check for "@euro" - // 6. guess locale from ctype unless ctype is "C" - // 7. guess locale from lang - - // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) - int indexOfDot = ctype.indexOf('.'); - if (indexOfDot != -1) - locale = checkForCodec( ctype.mid(indexOfDot + 1) ); - - // 2. CODESET from lang if it contains a .CODESET part - if (!locale) { - indexOfDot = lang.indexOf('.'); - if (indexOfDot != -1) - locale = checkForCodec( lang.mid(indexOfDot + 1) ); - } - - // 3. ctype (maybe the locale is named "ISO-8859-1" or something) - if (!locale && !ctype.isEmpty() && ctype != "C") - locale = checkForCodec(ctype); - - // 4. locale (ditto) - if (!locale && !lang.isEmpty()) - locale = checkForCodec(lang); - - // 5. "@euro" - if ((!locale && ctype.contains("@euro")) || lang.contains("@euro")) - locale = checkForCodec("ISO 8859-15"); - } - -#endif - // If everything failed, we default to 8859-1 - if (!locale) - locale = QTextCodec::codecForName("ISO 8859-1"); - globalData->codecForLocale.storeRelease(locale); - return locale; -} - - -// textCodecsMutex need to be locked to enter this function -static void setup() -{ - static bool initialized = false; - if (initialized) - return; - initialized = true; - -#if QT_CONFIG(codecs) && !defined(QT_BOOTSTRAPPED) - (void)new QTsciiCodec; - for (int i = 0; i < 9; ++i) - (void)new QIsciiCodec(i); - for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i) - (void)new QSimpleTextCodec(i); - -# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY) - (void)new QGb18030Codec; - (void)new QGbkCodec; - (void)new QGb2312Codec; - (void)new QEucJpCodec; - (void)new QJisCodec; - (void)new QSjisCodec; - (void)new QEucKrCodec; - (void)new QCP949Codec; - (void)new QBig5Codec; - (void)new 
QBig5hkscsCodec; -# endif // big_codecs && !Q_OS_INTEGRITY -#if QT_CONFIG(iconv) - (void) new QIconvCodec; -#endif -#if defined(Q_OS_WIN32) - (void) new QWindowsLocalCodec; -#endif // Q_OS_WIN32 -#endif // codecs && !QT_BOOTSTRAPPED - - (void)new QUtf16Codec; - (void)new QUtf16BECodec; - (void)new QUtf16LECodec; - (void)new QUtf32Codec; - (void)new QUtf32BECodec; - (void)new QUtf32LECodec; - (void)new QLatin15Codec; - (void)new QLatin1Codec; - (void)new QUtf8Codec; -} -#else -static void setup() {} -#endif // icu - -/*! - \enum QTextCodec::ConversionFlag - - \value DefaultConversion No flag is set. - \value ConvertInvalidToNull If this flag is set, each invalid input - character is output as a null character. - \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any. - - \omitvalue FreeFunction -*/ - -/*! - \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags) - - Constructs a ConverterState object initialized with the given \a flags. -*/ - -/*! - \class QTextCodec - \inmodule QtCore - \brief The QTextCodec class provides conversions between text encodings. - \reentrant - \ingroup i18n - - Qt uses Unicode to store, draw and manipulate strings. In many - situations you may wish to deal with data that uses a different - encoding. For example, most Japanese documents are still stored - in Shift-JIS or ISO 2022-JP, while Russian users often have their - documents in KOI8-R or Windows-1251. - - Qt provides a set of QTextCodec classes to help with converting - non-Unicode formats to and from Unicode. You can also create your - own codec classes. 
- - The supported encodings are: - - \list - \li \l{Big5 Text Codec}{Big5} - \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS} - \li CP949 - \li \l{EUC-JP Text Codec}{EUC-JP} - \li \l{EUC-KR Text Codec}{EUC-KR} - \li \l{GBK Text Codec}{GB18030} - \li HP-ROMAN8 - \li IBM 850 - \li IBM 866 - \li IBM 874 - \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP} - \li ISO 8859-1 to 10 - \li ISO 8859-13 to 16 - \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml - \li KOI8-R - \li KOI8-U - \li Macintosh - \li \l{Shift-JIS Text Codec}{Shift-JIS} - \li TIS-620 - \li \l{TSCII Text Codec}{TSCII} - \li UTF-8 - \li UTF-16 - \li UTF-16BE - \li UTF-16LE - \li UTF-32 - \li UTF-32BE - \li UTF-32LE - \li Windows-1250 to 1258 - \endlist - - If Qt is compiled with ICU support enabled, most codecs supported by - ICU will also be available to the application. - - \l {QTextCodec}s can be used as follows to convert some locally encoded - string to Unicode. Suppose you have some string encoded in Russian - KOI8-R encoding, and want to convert it to Unicode. The simple way - to do it is like this: - - \snippet code/src_corelib_codecs_qtextcodec.cpp 0 - - After this, \c string holds the text converted to Unicode. - Converting a string from Unicode to the local encoding is just as - easy: - - \snippet code/src_corelib_codecs_qtextcodec.cpp 1 - - To read or write files in various encodings, use QTextStream and - its \l{QTextStream::setCodec()}{setCodec()} function. See the - \l{tools/codecs}{Codecs} example for an application of QTextCodec - to file I/O. - - Some care must be taken when trying to convert the data in chunks, - for example, when receiving it over a network. In such cases it is - possible that a multi-byte character will be split over two - chunks. At best this might result in the loss of a character and - at worst cause the entire conversion to fail. 
- - The approach to use in these situations is to create a QTextDecoder - object for the codec and use this QTextDecoder for the whole - decoding process, as shown below: - - \snippet code/src_corelib_codecs_qtextcodec.cpp 2 - - The QTextDecoder object maintains state between chunks and therefore - works correctly even if a multi-byte character is split between - chunks. - - \section1 Creating Your Own Codec Class - - Support for new text encodings can be added to Qt by creating - QTextCodec subclasses. - - The pure virtual functions describe the encoder to the system and - the coder is used as required in the different text file formats - supported by QTextStream, and under X11, for the locale-specific - character input and output. - - To add support for another encoding to Qt, make a subclass of - QTextCodec and implement the functions listed in the table below. - - \table - \header \li Function \li Description - - \row \li name() - \li Returns the official name for the encoding. If the - encoding is listed in the - \l{IANA character-sets encoding file}, the name - should be the preferred MIME name for the encoding. - - \row \li aliases() - \li Returns a list of alternative names for the encoding. - QTextCodec provides a default implementation that returns - an empty list. For example, "ISO-8859-1" has "latin1", - "CP819", "IBM819", and "iso-ir-100" as aliases. - - \row \li \l{QTextCodec::mibEnum()}{mibEnum()} - \li Return the MIB enum for the encoding if it is listed in - the \l{IANA character-sets encoding file}. - - \row \li convertToUnicode() - \li Converts an 8-bit character string to Unicode. - - \row \li convertFromUnicode() - \li Converts a Unicode string to an 8-bit character string. - \endtable - - \sa QTextStream, QTextDecoder, QTextEncoder, {Text Codecs Example} -*/ - -/*! - Constructs a QTextCodec, and gives it the highest precedence. The - QTextCodec should always be constructed on the heap (i.e. with \c - new). 
Qt takes ownership and will delete it when the application - terminates. -*/ -QTextCodec::QTextCodec() -{ - const TextCodecsMutexLocker locker; - - QTextCodecData *globalInstance = QTextCodecData::instance(); - if (globalInstance->allCodecs.isEmpty()) - setup(); - - globalInstance->allCodecs.prepend(this); -} - - -/*! - \nonreentrant - - Destroys the QTextCodec. Note that you should not delete codecs - yourself: once created they become Qt's responsibility. -*/ -QTextCodec::~QTextCodec() -{ - QTextCodecData *globalData = QTextCodecData::instance(); - if (!globalData) - return; - - globalData->codecForLocale.testAndSetRelaxed(this, nullptr); - - const TextCodecsMutexLocker locker; - - globalData->allCodecs.removeOne(this); - - auto it = globalData->codecCache.begin(); - - while (it != globalData->codecCache.end()) { - if (it.value() == this) - it = globalData->codecCache.erase(it); - else - ++it; - } -} - -/*! - \fn QTextCodec *QTextCodec::codecForName(const char *name) - - Searches all installed QTextCodec objects and returns the one - which best matches \a name; the match is case-insensitive. Returns - 0 if no codec matching the name \a name could be found. -*/ - -/*! - \threadsafe - Searches all installed QTextCodec objects and returns the one - which best matches \a name; the match is case-insensitive. Returns - 0 if no codec matching the name \a name could be found. 
-*/ -QTextCodec *QTextCodec::codecForName(const QByteArray &name) -{ - if (name.isEmpty()) - return nullptr; - - const TextCodecsMutexLocker locker; - - QTextCodecData *globalData = QTextCodecData::instance(); - if (!globalData) - return nullptr; - setup(); - -#if !QT_CONFIG(icu) - QTextCodecCache *cache = &globalData->codecCache; - QTextCodec *codec; - codec = cache->value(name); - if (codec) - return codec; - - for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { - QTextCodec *cursor = *it; - if (qTextCodecNameMatch(cursor->name(), name)) { - if (cache) - cache->insert(name, cursor); - return cursor; - } - QList<QByteArray> aliases = cursor->aliases(); - for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) { - if (qTextCodecNameMatch(*ait, name)) { - cache->insert(name, cursor); - return cursor; - } - } - } - - return nullptr; -#else - return QIcuCodec::codecForNameUnlocked(name); -#endif -} - - -/*! - \threadsafe - Returns the QTextCodec which matches the - \l{QTextCodec::mibEnum()}{MIBenum} \a mib. -*/ -QTextCodec* QTextCodec::codecForMib(int mib) -{ - const TextCodecsMutexLocker locker; - - QTextCodecData *globalData = QTextCodecData::instance(); - if (!globalData) - return nullptr; - if (globalData->allCodecs.isEmpty()) - setup(); - - QByteArray key = "MIB: " + QByteArray::number(mib); - - QTextCodecCache *cache = &globalData->codecCache; - QTextCodec *codec; - if (cache) { - codec = cache->value(key); - if (codec) - return codec; - } - - for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { - QTextCodec *cursor = *it; - if (cursor->mibEnum() == mib) { - if (cache) - cache->insert(key, cursor); - return cursor; - } - } - -#if QT_CONFIG(icu) - return QIcuCodec::codecForMibUnlocked(mib); -#else - return nullptr; -#endif -} - -/*! 
- \threadsafe - Returns the list of all available codecs, by name. Call - QTextCodec::codecForName() to obtain the QTextCodec for the name. - - The list may contain many mentions of the same codec - if the codec has aliases. - - \sa availableMibs(), name(), aliases() -*/ -QList<QByteArray> QTextCodec::availableCodecs() -{ - const TextCodecsMutexLocker locker; - - QTextCodecData *globalData = QTextCodecData::instance(); - if (globalData->allCodecs.isEmpty()) - setup(); - - QList<QByteArray> codecs; - - for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { - codecs += (*it)->name(); - codecs += (*it)->aliases(); - } - -#if QT_CONFIG(icu) - codecs += QIcuCodec::availableCodecs(); -#endif - - return codecs; -} - -/*! - \threadsafe - Returns the list of MIBs for all available codecs. Call - QTextCodec::codecForMib() to obtain the QTextCodec for the MIB. - - \sa availableCodecs(), mibEnum() -*/ -QList<int> QTextCodec::availableMibs() -{ -#if QT_CONFIG(icu) - return QIcuCodec::availableMibs(); -#else - const TextCodecsMutexLocker locker; - - QTextCodecData *globalData = QTextCodecData::instance(); - if (globalData->allCodecs.isEmpty()) - setup(); - - QList<int> codecs; - - for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) - codecs += (*it)->mibEnum(); - - return codecs; -#endif -} - -/*! - \nonreentrant - - Set the codec to \a c; this will be returned by - codecForLocale(). If \a c is \nullptr, the codec is reset to - the default. - - This might be needed for some applications that want to use their - own mechanism for setting the locale. - - \sa codecForLocale() -*/ -void QTextCodec::setCodecForLocale(QTextCodec *c) -{ - QTextCodecData::instance()->codecForLocale.storeRelease(c); -} - -/*! - \threadsafe - Returns a pointer to the codec most suitable for this locale. 
- - The codec will be retrieved from ICU where that backend is in use, otherwise - it may be obtained from an OS-specific API. In the latter case, the codec's - name may be "System". -*/ - -QTextCodec* QTextCodec::codecForLocale() -{ - QTextCodecData *globalData = QTextCodecData::instance(); - if (!globalData) - return nullptr; - - QTextCodec *codec = globalData->codecForLocale.loadAcquire(); - if (!codec) { -#if QT_CONFIG(icu) - const TextCodecsMutexLocker locker; - codec = QIcuCodec::defaultCodecUnlocked(); -#else - // setupLocaleMapper locks as necessary - codec = setupLocaleMapper(); -#endif - } - - return codec; -} - - -/*! - \fn QByteArray QTextCodec::name() const - - QTextCodec subclasses must reimplement this function. It returns - the name of the encoding supported by the subclass. - - If the codec is registered as a character set in the - \l{IANA character-sets encoding file} this method should - return the preferred mime name for the codec if defined, - otherwise its name. -*/ - -/*! - \fn int QTextCodec::mibEnum() const - - Subclasses of QTextCodec must reimplement this function. It - returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file} - for more information). It is important that each QTextCodec - subclass returns the correct unique value for this function. -*/ - -/*! - Subclasses can return a number of aliases for the codec in question. - - Standard aliases for codecs can be found in the - \l{IANA character-sets encoding file}. -*/ -QList<QByteArray> QTextCodec::aliases() const -{ - return QList<QByteArray>(); -} - -/*! - \fn QString QTextCodec::convertToUnicode(const char *chars, int len, - ConverterState *state) const - - QTextCodec subclasses must reimplement this function. - - Converts the first \a len characters of \a chars from the - encoding of the subclass to Unicode, and returns the result in a - QString. 
- - \a state can be \nullptr, in which case the conversion is stateless and - default conversion rules should be used. If state is not 0, the - codec should save the state after the conversion in \a state, and - adjust the \c remainingChars and \c invalidChars members of the struct. -*/ - -/*! - \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number, - ConverterState *state) const - - QTextCodec subclasses must reimplement this function. - - Converts the first \a number of characters from the \a input array - from Unicode to the encoding of the subclass, and returns the result - in a QByteArray. - - \a state can be \nullptr in which case the conversion is stateless and - default conversion rules should be used. If state is not 0, the - codec should save the state after the conversion in \a state, and - adjust the \c remainingChars and \c invalidChars members of the struct. -*/ - -/*! - Creates a QTextDecoder with a specified \a flags to decode chunks - of \c{char *} data to create chunks of Unicode data. - - The caller is responsible for deleting the returned object. - - \since 4.7 -*/ -QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const -{ - return new QTextDecoder(this, flags); -} - -/*! - Creates a QTextEncoder with a specified \a flags to encode chunks - of Unicode data as \c{char *} data. - - The caller is responsible for deleting the returned object. - - \since 4.7 -*/ -QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const -{ - return new QTextEncoder(this, flags); -} - -/*! - \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number, - ConverterState *state) const - - Converts the first \a number of characters from the \a input array - from Unicode to the encoding of this codec, and returns the result - in a QByteArray. - - The \a state of the convertor used is updated. -*/ - -#if QT_STRINGVIEW_LEVEL < 2 -/*! 
- Converts \a str from Unicode to the encoding of this codec, and - returns the result in a QByteArray. -*/ -QByteArray QTextCodec::fromUnicode(const QString& str) const -{ - return convertFromUnicode(str.constData(), str.length(), nullptr); -} -#endif - -/*! - \overload - \since 5.10 - - Converts \a str from Unicode to the encoding of this codec, and - returns the result in a QByteArray. -*/ -QByteArray QTextCodec::fromUnicode(QStringView str) const -{ - return convertFromUnicode(str.data(), str.length(), nullptr); -} - -/*! - \fn QString QTextCodec::toUnicode(const char *input, int size, - ConverterState *state) const - - Converts the first \a size characters from the \a input from the - encoding of this codec to Unicode, and returns the result in a - QString. - - The \a state of the convertor used is updated. -*/ - -/*! - Converts \a a from the encoding of this codec to Unicode, and - returns the result in a QString. -*/ -QString QTextCodec::toUnicode(const QByteArray& a) const -{ - return convertToUnicode(a.constData(), a.length(), nullptr); -} - -/*! - Returns \c true if the Unicode character \a ch can be fully encoded - with this codec; otherwise returns \c false. -*/ -bool QTextCodec::canEncode(QChar ch) const -{ - ConverterState state; - state.flags = ConvertInvalidToNull; - convertFromUnicode(&ch, 1, &state); - return (state.invalidChars == 0); -} - -#if QT_STRINGVIEW_LEVEL < 2 -/*! - \overload - - \a s contains the string being tested for encode-ability. -*/ -bool QTextCodec::canEncode(const QString& s) const -{ - ConverterState state; - state.flags = ConvertInvalidToNull; - convertFromUnicode(s.constData(), s.length(), &state); - return (state.invalidChars == 0); -} -#endif - -/*! - \overload - \since 5.10 - - Returns \c true if the Unicode string \a s can be fully encoded - with this codec; otherwise returns \c false. 
-*/ -bool QTextCodec::canEncode(QStringView s) const -{ - ConverterState state; - state.flags = ConvertInvalidToNull; - convertFromUnicode(s.data(), s.length(), &state); - return !state.invalidChars; -} -/*! - \overload - - \a chars contains the source characters. -*/ -QString QTextCodec::toUnicode(const char *chars) const -{ - int len = qstrlen(chars); - return convertToUnicode(chars, len, nullptr); -} - - -/*! - \class QTextEncoder - \inmodule QtCore - \brief The QTextEncoder class provides a state-based encoder. - \reentrant - \ingroup i18n - - A text encoder converts text from Unicode into an encoded text format - using a specific codec. - - The encoder converts Unicode into another format, remembering any - state that is required between calls. - - \sa QTextCodec::makeEncoder(), QTextDecoder -*/ - -/*! - \fn QTextEncoder::QTextEncoder(const QTextCodec *codec) - - Constructs a text encoder for the given \a codec. -*/ - -/*! - Constructs a text encoder for the given \a codec and conversion \a flags. - - \since 4.7 -*/ -QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) - : c(codec), state() -{ - state.flags = flags; -} - -/*! - Destroys the encoder. -*/ -QTextEncoder::~QTextEncoder() -{ -} - -/*! - \internal - \since 4.5 - Determines whether the eecoder encountered a failure while decoding the input. If - an error was encountered, the produced result is undefined, and gets converted as according - to the conversion flags. - */ -bool QTextEncoder::hasFailure() const -{ - return state.invalidChars != 0; -} - -#if QT_STRINGVIEW_LEVEL < 2 -/*! - Converts the Unicode string \a str into an encoded QByteArray. -*/ -QByteArray QTextEncoder::fromUnicode(const QString& str) -{ - QByteArray result = c->fromUnicode(str.constData(), str.length(), &state); - return result; -} -#endif - -/*! - \overload - \since 5.10 - Converts the Unicode string \a str into an encoded QByteArray. 
-*/ -QByteArray QTextEncoder::fromUnicode(QStringView str) -{ - return c->fromUnicode(str.data(), str.length(), &state); -} - -/*! - \overload - - Converts \a len characters (not bytes) from \a uc, and returns the - result in a QByteArray. -*/ -QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len) -{ - QByteArray result = c->fromUnicode(uc, len, &state); - return result; -} - -/*! - \class QTextDecoder - \inmodule QtCore - \brief The QTextDecoder class provides a state-based decoder. - \reentrant - \ingroup i18n - - A text decoder converts text from an encoded text format into Unicode - using a specific codec. - - The decoder converts text in this format into Unicode, remembering any - state that is required between calls. - - \sa QTextCodec::makeDecoder(), QTextEncoder -*/ - -/*! - \fn QTextDecoder::QTextDecoder(const QTextCodec *codec) - - Constructs a text decoder for the given \a codec. -*/ - -/*! - Constructs a text decoder for the given \a codec and conversion \a flags. - - \since 4.7 -*/ - -QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags) - : c(codec), state() -{ - state.flags = flags; -} - -/*! - Destroys the decoder. -*/ -QTextDecoder::~QTextDecoder() -{ -} - -/*! - \fn QString QTextDecoder::toUnicode(const char *chars, int len) - - Converts the first \a len bytes in \a chars to Unicode, returning - the result. - - If not all characters are used (e.g. if only part of a multi-byte - encoding is at the end of the characters), the decoder remembers - enough state to continue with the next call to this function. -*/ -QString QTextDecoder::toUnicode(const char *chars, int len) -{ - return c->toUnicode(chars, len, &state); -} - -// in qstring.cpp: -void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept; - -/*! \overload - - The converted string is returned in \a target. 
- */ -void QTextDecoder::toUnicode(QString *target, const char *chars, int len) -{ - Q_ASSERT(target); - switch (c->mibEnum()) { - case 106: // utf8 - static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state); - break; - case 4: // latin1 - target->resize(len); - qt_from_latin1((char16_t*)target->data(), chars, len); - break; - default: - *target = c->toUnicode(chars, len, &state); - } -} - - -/*! - \overload - - Converts the bytes in the byte array specified by \a ba to Unicode - and returns the result. -*/ -QString QTextDecoder::toUnicode(const QByteArray &ba) -{ - return c->toUnicode(ba.constData(), ba.length(), &state); -} - -/*! - \since 4.4 - - Tries to detect the encoding of the provided snippet of HTML in - the given byte array, \a ba, by checking the BOM (Byte Order Mark) - and the content-type meta header and returns a QTextCodec instance - that is capable of decoding the html to unicode. If the codec - cannot be detected from the content provided, \a defaultCodec is - returned. - - \sa codecForUtfText() -*/ -QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) -{ - // determine charset - QTextCodec *c = QTextCodec::codecForUtfText(ba, nullptr); - if (!c) { - static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher("meta "); - QByteArray header = ba.left(1024).toLower(); - int pos = matcher.indexIn(header); - if (pos != -1) { - static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher("charset="); - pos = matcher.indexIn(header, pos); - if (pos != -1) { - pos += qstrlen("charset="); - - int pos2 = pos; - // The attribute can be closed with either """, "'", ">" or "/", - // none of which are valid charset characters. - while (++pos2 < header.size()) { - char ch = header.at(pos2); - if (ch == '\"' || ch == '\'' || ch == '>') { - QByteArray name = header.mid(pos, pos2 - pos); - if (name == "unicode") // QTBUG-41998, ICU will return UTF-16. 
- name = QByteArrayLiteral("UTF-8"); - c = QTextCodec::codecForName(name); - return c ? c : defaultCodec; - } - } - } - } - } - if (!c) - c = defaultCodec; - - return c; -} - -/*! - \overload - - Tries to detect the encoding of the provided snippet of HTML in - the given byte array, \a ba, by checking the BOM (Byte Order Mark) - and the content-type meta header and returns a QTextCodec instance - that is capable of decoding the html to unicode. If the codec cannot - be detected, this overload returns a Latin-1 QTextCodec. -*/ -QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) -{ - return codecForHtml(ba, QTextCodec::codecForName("ISO-8859-1")); -} - -/*! - \since 4.6 - - Tries to detect the encoding of the provided snippet \a ba by - using the BOM (Byte Order Mark) and returns a QTextCodec instance - that is capable of decoding the text to unicode. This function can - detect one of the following codecs: - - \list - \li UTF-32 Little Endian - \li UTF-32 Big Endian - \li UTF-16 Little Endian - \li UTF-16 Big Endian - \li UTF-8 - \endlist - - If the codec cannot be detected from the content provided, \a defaultCodec - is returned. 
- - \sa codecForHtml() -*/ -QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) -{ - const int arraySize = ba.size(); - const uchar *buf = reinterpret_cast<const uchar *>(ba.constData()); - const uint bom = 0xfeff; - - if (arraySize > 3) { - uint uc = qFromUnaligned<uint>(buf); - if (uc == qToBigEndian(bom)) - return QTextCodec::codecForMib(1018); // utf-32 be - else if (uc == qToLittleEndian(bom)) - return QTextCodec::codecForMib(1019); // utf-32 le - } - - if (arraySize < 2) - return defaultCodec; - - ushort uc = qFromUnaligned<ushort>(buf); - if (uc == qToBigEndian(ushort(bom))) - return QTextCodec::codecForMib(1013); // utf16 be - else if (uc == qToLittleEndian(ushort(bom))) - return QTextCodec::codecForMib(1014); // utf16 le - - if (arraySize < 3) - return defaultCodec; - - static const char utf8bom[] = "\xef\xbb\xbf"; - if (memcmp(buf, utf8bom, sizeof(utf8bom) - 1) == 0) - return QTextCodec::codecForMib(106); // utf-8 - - return defaultCodec; -} - -/*! - \overload - - Tries to detect the encoding of the provided snippet \a ba by - using the BOM (Byte Order Mark) and returns a QTextCodec instance - that is capable of decoding the text to unicode. This function can - detect one of the following codecs: - - \list - \li UTF-32 Little Endian - \li UTF-32 Big Endian - \li UTF-16 Little Endian - \li UTF-16 Big Endian - \li UTF-8 - \endlist - - If the codec cannot be detected from the content provided, this overload - returns a Latin-1 QTextCodec. - - \sa codecForHtml() -*/ -QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) -{ - return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); -} - -/*! - \fn QTextCodec * QTextCodec::codecForTr () - \obsolete - - Returns the codec used by QObject::tr() on its argument. If this - function returns \nullptr (the default), tr() assumes Latin-1. -*/ - -/*! - \internal - \since 4.3 - Determines whether the decoder encountered a failure while decoding the - input. 
If an error was encountered, the produced result is undefined, and - gets converted as according to the conversion flags. - */ -bool QTextDecoder::hasFailure() const -{ - return state.invalidChars != 0; -} - -/*! - \internal - \since 5.12 - - Determines whether the decoder needs more bytes to continue decoding. That - is, this signifies that the input string ended in the middle of a - multi-byte sequence. Note that it's possible some codecs do not report this. - */ -bool QTextDecoder::needsMoreData() const -{ - return state.remainingChars; -} - -/*! - \fn QTextCodec *Qt::codecForHtml(const QByteArray &ba) - \internal - - This function is defined in the \c <QTextCodec> header file. -*/ -QTextCodec *Qt::codecForHtml(const QByteArray &ba) -{ - return QTextCodec::codecForHtml(ba); -} - -QT_END_NAMESPACE |