Remove QTextCodec dependency in the old SAX parser

Just so we can get this cleaned up as well and remove QTextCodec from Qt Core.

Change-Id: I2b5b821b039ce2c024ec3cb7338a1a9becdd2157
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Lars Knoll <lars.knoll@qt.io> 2020-04-30 15:06:27 +0200
committer: Lars Knoll <lars.knoll@qt.io> 2020-05-14 07:51:32 +0200
commit: 1a6bf6c549bcad78f7918dc53c48e5db4fdf4c3d (patch)
tree: f0aef1e52795bb4b0d11a6ea69c690909c6e9cd7 /src/xml/sax/qxml.cpp
parent: b88720eb82056f5435f48dff728572126c04ca6a (diff)
1 file changed, 26 insertions, 64 deletions
diff --git a/src/xml/sax/qxml.cpp b/src/xml/sax/qxml.cpp
index 1898f4fbe8..d1af7e42da 100644
--- a/src/xml/sax/qxml.cpp
+++ b/src/xml/sax/qxml.cpp
@@ -244,9 +244,7 @@ public:
     int pos;
     int length;
     bool nextReturnedEndOfData;
-#if QT_CONFIG(textcodec)
-    QTextDecoder *encMapper;
-#endif
+    QStringDecoder toUnicode;
 
     QByteArray encodingDeclBytes;
     QString encodingDeclChars;
@@ -1090,9 +1088,6 @@ void QXmlInputSource::init()
         d->inputStream = nullptr;
 
         setData(QString());
-#if QT_CONFIG(textcodec)
-        d->encMapper = nullptr;
-#endif
         d->nextReturnedEndOfData = true; // first call to next() will call fetchData()
 
         d->encodingDeclBytes.clear();
@@ -1136,9 +1131,6 @@ QXmlInputSource::QXmlInputSource(QIODevice *dev)
 QXmlInputSource::~QXmlInputSource()
 {
     // ### close the input device.
-#if QT_CONFIG(textcodec)
-    delete d->encMapper;
-#endif
     delete d;
 }
 
@@ -1356,77 +1348,47 @@ QString QXmlInputSource::fromRawData(const QByteArray &data, bool beginning)
 {
     if (data.size() == 0)
         return QString();
-    if (beginning) {
-        delete d->encMapper;
-        d->encMapper = nullptr;
-    }
 
-    int mib = 106; // UTF-8
+    if (beginning)
+        d->toUnicode = QStringDecoder();
 
     // This is the initial UTF codec we will read the encoding declaration with
-    if (d->encMapper == nullptr) {
+    if (!d->toUnicode.isValid()) {
         d->encodingDeclBytes.clear();
         d->encodingDeclChars.clear();
         d->lookingForEncodingDecl = true;
 
-        // look for byte order mark and read the first 5 characters
-        if (data.size() >= 4) {
-            uchar ch1 = data.at(0);
-            uchar ch2 = data.at(1);
-            uchar ch3 = data.at(2);
-            uchar ch4 = data.at(3);
-
-            if ((ch1 == 0 && ch2 == 0 && ch3 == 0xfe && ch4 == 0xff) ||
-                (ch1 == 0xff && ch2 == 0xfe && ch3 == 0 && ch4 == 0))
-                mib = 1017; // UTF-32 with byte order mark
-            else if (ch1 == 0x3c && ch2 == 0x00 && ch3 == 0x00 && ch4 == 0x00)
-                mib = 1019; // UTF-32LE
-            else if (ch1 == 0x00 && ch2 == 0x00 && ch3 == 0x00 && ch4 == 0x3c)
-                mib = 1018; // UTF-32BE
-        }
-        if (mib == 106 && data.size() >= 2) {
-            uchar ch1 = data.at(0);
-            uchar ch2 = data.at(1);
-
-            if ((ch1 == 0xfe && ch2 == 0xff) || (ch1 == 0xff && ch2 == 0xfe))
-                mib = 1015; // UTF-16 with byte order mark
-            else if (ch1 == 0x3c && ch2 == 0x00)
-                mib = 1014; // UTF-16LE
-            else if (ch1 == 0x00 && ch2 == 0x3c)
-                mib = 1013; // UTF-16BE
-        }
-
-        QTextCodec *codec = QTextCodec::codecForMib(mib);
-        Q_ASSERT(codec);
-
-        d->encMapper = codec->makeDecoder();
+        auto encoding = QStringConverter::encodingForData(data.constData(), data.size(), char16_t('<'));
+        if (encoding) {
+            d->lookingForEncodingDecl = false;
+            d->toUnicode = QStringDecoder(*encoding);
+        } else {
+            d->toUnicode = QStringDecoder(QStringDecoder::Utf8);
+        }
     }
 
-    QString input = d->encMapper->toUnicode(data.constData(), data.size());
+    QString input = d->toUnicode(data.constData(), data.size());
 
     if (d->lookingForEncodingDecl) {
         d->encodingDeclChars += input;
 
         bool needMoreText;
-        QString encoding = extractEncodingDecl(d->encodingDeclChars, &needMoreText);
+        QByteArray encoding = extractEncodingDecl(d->encodingDeclChars, &needMoreText).toLatin1();
 
         if (!encoding.isEmpty()) {
-            if (QTextCodec *codec = QTextCodec::codecForName(std::move(encoding).toLatin1())) {
-                /* If the encoding is the same, we don't have to do toUnicode() all over again. */
-                if(codec->mibEnum() != mib) {
-                    delete d->encMapper;
-                    d->encMapper = codec->makeDecoder();
-
-                    /* The variable input can potentially be large, so we deallocate
-                     * it before calling toUnicode() in order to avoid having two
-                     * large QStrings in memory simultaneously. */
-                    input.clear();
-
-                    // prime the decoder with the data so far
-                    d->encMapper->toUnicode(d->encodingDeclBytes.constData(), d->encodingDeclBytes.size());
-                    // now feed it the new data
-                    input = d->encMapper->toUnicode(data.constData(), data.size());
-                }
+            auto e = QStringDecoder::encodingForData(encoding.constData(), encoding.size());
+            if (e && *e != QStringDecoder::Utf8) {
+                d->toUnicode = QStringDecoder(*e);
+
+                /* The variable input can potentially be large, so we deallocate
+                 * it before calling toUnicode() in order to avoid having two
+                 * large QStrings in memory simultaneously. */
+                input.clear();
+
+                // prime the decoder with the data so far
+                d->toUnicode(d->encodingDeclBytes.constData(), d->encodingDeclBytes.size());
+                // now feed it the new data
+                input = d->toUnicode(data.constData(), data.size());
             }
         }
author	Lars Knoll <lars.knoll@qt.io>	2020-04-30 15:06:27 +0200
committer	Lars Knoll <lars.knoll@qt.io>	2020-05-14 07:51:32 +0200
commit	1a6bf6c549bcad78f7918dc53c48e5db4fdf4c3d (patch)
tree	f0aef1e52795bb4b0d11a6ea69c690909c6e9cd7 /src/xml/sax/qxml.cpp
parent	b88720eb82056f5435f48dff728572126c04ca6a (diff)