From 1dc5223a273a70fb9ca0e10d07c8de7a9da308bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Val=C3=A9rio=20Val=C3=A9rio?= Date: Tue, 17 Mar 2015 12:35:34 +0200 Subject: Use QTextDocument to parse html Regular expressions are not an appropriate tool for parsing a non-regular language like HTML; a proper parser should be used. This commit introduces a dependency on QtGui, making the messageserver binary marginally bigger in size. Use of the HTML parser is optional and can be enabled by defining the USE_HTML_PARSER compile flag. Change-Id: I2dba9042bb7f5340bfd8c24cb59c2a769489a7c6 Reviewed-by: Damien Caliste Reviewed-by: Matthew Vogt --- src/libraries/qmfclient/qmailmessage.cpp | 78 ++++++++++++++++++------------- src/libraries/qmfclient/qmfclient.pro | 5 ++ src/tools/messageserver/main.cpp | 10 ++++ src/tools/messageserver/messageserver.pro | 4 ++ 4 files changed, 65 insertions(+), 32 deletions(-) diff --git a/src/libraries/qmfclient/qmailmessage.cpp b/src/libraries/qmfclient/qmailmessage.cpp index 98108179..0ab3ca7a 100644 --- a/src/libraries/qmfclient/qmailmessage.cpp +++ b/src/libraries/qmfclient/qmailmessage.cpp @@ -54,6 +54,9 @@ #include #include #include +#ifdef USE_HTML_PARSER +#include +#endif #include #include @@ -8591,12 +8594,50 @@ static void setMessagePriorityFromHeaderFields(QMailMessage *mail) return; // Normal Priority } +static QString htmlToPlainText(const QString &html) +{ +#ifdef USE_HTML_PARSER + QTextDocument doc; + doc.setHtml(html); + return doc.toPlainText(); +#else + QString plainText = html; + plainText.remove(QRegExp(QLatin1String("<\\s*(style|head|form|script)[^<]*<\\s*/\\s*\\1\\s*>"), Qt::CaseInsensitive)); + plainText.remove(QRegExp(QLatin1String("<(.)[^>]*>"))); + plainText.replace(QLatin1String("""), QLatin1String("\""), Qt::CaseInsensitive); + plainText.replace(QLatin1String(" "), QLatin1String(" "), Qt::CaseInsensitive); + plainText.replace(QLatin1String("&"), QLatin1String("&"), Qt::CaseInsensitive); + plainText.replace(QLatin1String("<"), QLatin1String("<"), 
Qt::CaseInsensitive); + plainText.replace(QLatin1String(">"), QLatin1String(">"), Qt::CaseInsensitive); + + // now replace stuff like "м" + int pos = 0; + while (true) { + pos = plainText.indexOf(QLatin1String("&#"), pos); + if (pos < 0) + break; + int semicolon = plainText.indexOf(';', pos+2); + if (semicolon < 0) { + ++pos; + continue; + } + int code = (plainText.mid(pos+2, semicolon-pos-2)).toInt(); + if (code == 0) { + ++pos; + continue; + } + plainText.replace(pos, semicolon-pos+1, QChar(code)); + } + + return plainText.simplified(); +#endif +} + /*! \internal */ void QMailMessage::refreshPreview() { const int maxPreviewLength = 280; // TODO: don't load entire body into memory - // TODO: parse html correctly, e.g. closing brackets in quotes in tags QMailMessagePartContainer *htmlPart= findHtmlContainer(); QMailMessagePartContainer *plainTextPart= findPlainTextContainer(); @@ -8604,40 +8645,13 @@ void QMailMessage::refreshPreview() plainTextPart=0; if ( plainTextPart && plainTextPart->hasBody()) { - QString plaintext(plainTextPart->body().data()); - plaintext.remove(QRegExp(QLatin1String("\\[(image|cid):[^\\]]*\\]"), Qt::CaseInsensitive)); - metaDataImpl()->setPreview(plaintext.left(maxPreviewLength)); + QString plainText = plainTextPart->body().data(); + metaDataImpl()->setPreview(plainText.left(maxPreviewLength)); } else if (htmlPart && ( multipartType() == MultipartRelated || htmlPart->hasBody())) { QString markup = htmlPart->body().data(); - markup.remove(QRegExp(QLatin1String("<\\s*(style|head|form|script)[^<]*<\\s*/\\s*\\1\\s*>"), Qt::CaseInsensitive)); - markup.remove(QRegExp(QLatin1String("<(.)[^>]*>"))); - markup.replace(QLatin1String("""), QLatin1String("\""), Qt::CaseInsensitive); - markup.replace(QLatin1String(" "), QLatin1String(" "), Qt::CaseInsensitive); - markup.replace(QLatin1String("&"), QLatin1String("&"), Qt::CaseInsensitive); - markup.replace(QLatin1String("<"), QLatin1String("<"), Qt::CaseInsensitive); - markup.replace(QLatin1String(">"), 
QLatin1String(">"), Qt::CaseInsensitive); - - // now replace stuff like "м" - for (int pos = 0; ; ) { - pos = markup.indexOf(QLatin1String("&#"), pos); - if (pos < 0) - break; - int semicolon = markup.indexOf(';', pos+2); - if (semicolon < 0) { - ++pos; - continue; - } - int code = (markup.mid(pos+2, semicolon-pos-2)).toInt(); - if (code == 0) { - ++pos; - continue; - } - markup.replace(pos, semicolon-pos+1, QChar(code)); - } - - metaDataImpl()->setPreview(markup.simplified().left(maxPreviewLength)); + metaDataImpl()->setPreview(htmlToPlainText(markup).left(maxPreviewLength)); } - + partContainerImpl()->setPreviewDirty(false); } diff --git a/src/libraries/qmfclient/qmfclient.pro b/src/libraries/qmfclient/qmfclient.pro index f643c4d1..665aba1b 100644 --- a/src/libraries/qmfclient/qmfclient.pro +++ b/src/libraries/qmfclient/qmfclient.pro @@ -16,6 +16,11 @@ DEFINES += QMF_INSTALL_ROOT=\\\"$$QMF_INSTALL_ROOT\\\" #DEPENDPATH += . INCLUDEPATH += support +contains(DEFINES, USE_HTML_PARSER) { + QT += gui +} + + HEADERS += \ qmailaccount.h \ qmailaccountconfiguration.h \ diff --git a/src/tools/messageserver/main.cpp b/src/tools/messageserver/main.cpp index c6f0edc9..0076e711 100644 --- a/src/tools/messageserver/main.cpp +++ b/src/tools/messageserver/main.cpp @@ -37,6 +37,10 @@ #include #include #include +#include +#ifdef USE_HTML_PARSER +#include +#endif #if !defined(NO_SHUTDOWN_SIGNAL_HANDLING) && defined(Q_OS_UNIX) @@ -58,7 +62,13 @@ static void recreateLoggers(int n) int main(int argc, char** argv) { +#ifdef USE_HTML_PARSER + // Needed for HTML parsing in qmailmessage.cpp, but no real UI is required + setenv("QT_QPA_PLATFORM", "minimal", 1); + QGuiApplication app(argc, argv); +#else QCoreApplication app(argc, argv); +#endif // This is ~/.config/QtProject/Messageserver.conf qMailLoggersRecreate("QtProject", "Messageserver", "Msgsrv"); diff --git a/src/tools/messageserver/messageserver.pro b/src/tools/messageserver/messageserver.pro index a70edfe4..f7d92931 100644 --- 
a/src/tools/messageserver/messageserver.pro +++ b/src/tools/messageserver/messageserver.pro @@ -9,6 +9,10 @@ SERVER_AS_DLL: { TARGET = messageserver5 QT = core qmfclient qmfclient-private qmfmessageserver +contains(DEFINES, USE_HTML_PARSER) { + QT += gui +} + !contains(DEFINES,QMF_NO_WIDGETS) { QT += gui widgets } -- cgit v1.2.3