1 files changed, 777 insertions, 0 deletions
diff --git a/src/linguist/lupdate/python.cpp b/src/linguist/lupdate/python.cpp
new file mode 100644
index 000000000..0bc3bf5e8
--- /dev/null
+++ b/src/linguist/lupdate/python.cpp
@@ -0,0 +1,777 @@
+// Copyright (C) 2002-2007 Detlev Offenbach <detlev@die-offenbachs.de>
+// Copyright (C) 2021 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
+
+#include <translator.h>
+#include "lupdate.h"
+
+#include <QtCore/qhash.h>
+#include <QtCore/qstring.h>
+#include <QtCore/qtextstream.h>
+#include <QtCore/qstack.h>
+
+#include <cctype>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+
+QT_BEGIN_NAMESPACE
+
+// Prefix that parse() looks for at the start of a Tok_Comment text; what
+// follows is split at the first blank into a context name and a comment.
+static const char PythonMagicComment[] = "TRANSLATOR ";
+
+/*
+  The first part of this source file is the Python tokenizer.  We skip
+  most of Python; the only tokens that interest us are defined here.
+*/
+
+// Token kinds handed from the tokenizer to the parser. Identifier-like words
+// are looked up in the `tokens` hash and fall back to Tok_Ident when not
+// listed there.
+// NOTE(review): parse() handles Tok_Comment, but no path of getToken() in
+// this file returns it.
+enum Token { Tok_Eof, Tok_class, Tok_def, Tok_return, Tok_tr,
+             Tok_trUtf8, Tok_translate, Tok_Ident,
+             Tok_Comment, Tok_Dot, Tok_String,
+             Tok_LeftParen, Tok_RightParen,
+             Tok_Comma, Tok_None, Tok_Integer};
+
+// Kind of string literal the tokenizer is asked to scan. matchStringStart()
+// passes RawString for an r"..." literal and FormatString for an f"..."
+// literal; in this file only RawString changes behavior (escape handling in
+// parseStringEscape()).
+enum class StringType
+{
+    NoString,      // default argument of getToken() / parseString()
+    String,
+    FormatString,  // f"..."
+    RawString      // r"..."
+};
+
+/*
+  The tokenizer maintains the following global variables. The names
+  should be self-explanatory.
+*/
+static QString yyFileName;    // file being tokenized; used in warnings and messages
+static int yyCh;              // current lookahead character, or EOF
+static QByteArray yyIdent;    // identifier text collected by the last getToken() call
+static char yyComment[65536]; // read by parse() for Tok_Comment; never written in this file
+static size_t yyCommentLen;   // reset to 0 by getToken(); not used otherwise
+static char yyString[65536];  // NUL-terminated contents of the last string literal
+static size_t yyStringLen;    // number of bytes currently stored in yyString
+static int yyParenDepth;      // count of '(' seen minus count of ')' seen
+static int yyLineNo;          // copy of yyCurLineNo taken by getToken() while scanning
+static int yyCurLineNo;       // line counter, advanced for every '\n' read from the file
+
+static QByteArray extraComment; // text of the last "#:" comment, consumed by setMessageParameters()
+static QByteArray id;           // text of the last "#=" comment, consumed by setMessageParameters()
+
+// Maps identifier spellings to their token kind. loadPython() adds the
+// configured translation-function aliases on its first call. Internal
+// linkage, like every other global in this file, so that the generic name
+// cannot collide with a symbol of another translation unit.
+static QHash<QByteArray, Token> tokens = {
+    {"None", Tok_None},
+    {"class", Tok_class},
+    {"def", Tok_def},
+    {"return", Tok_return},
+    {"__tr", Tok_tr}, // Legacy?
+    {"__trUtf8", Tok_trUtf8}
+};
+
+// the file to read from (if reading from a file)
+static FILE *yyInFile;
+
+// the string to read from and current position in the string (otherwise)
+// NOTE(review): yyInPos is only reset in startTokenizer(); no string-based
+// reader exists in this file.
+static int yyInPos;
+// One-character pushback filled by peekCharFromFile(); a negative value
+// means nothing is buffered.
+static int buf;
+
+// Character source, installed by startTokenizer().
+static int (*getChar)();
+static int (*peekChar)();
+
+// Width of one indentation step, taken from the first indented "class" or
+// "def"; -1 until it is known.
+static int yyIndentationSize;
+// Number of blanks/tabs read at the start of the current line.
+static int yyContinuousSpaceCount;
+// True while only blanks/tabs have been read since the last newline.
+static bool yyCountingIndentation;
+
+// (Context, indentation level) pair.
+using ContextPair = QPair<QByteArray, int>;
+// Stack of (Context, indentation level) pairs.
+using ContextStack = QStack<ContextPair>;
+static ContextStack yyContextStack;
+
+// Returns the next character of yyInFile (or EOF), taking a character
+// buffered by peekCharFromFile() first. Keeps the line counter and the
+// count of leading blanks on the current line up to date.
+static int getCharFromFile()
+{
+    int ch = buf;
+    if (ch >= 0)
+        buf = -1; // the pushed-back character is consumed now
+    else
+        ch = getc(yyInFile);
+
+    if (ch == '\n') {
+        // A new line starts: count it and begin measuring its indentation.
+        ++yyCurLineNo;
+        yyCountingIndentation = true;
+        yyContinuousSpaceCount = 0;
+        return ch;
+    }
+
+    const bool blank = (ch == ' ' || ch == '\t');
+    if (yyCountingIndentation && blank)
+        ++yyContinuousSpaceCount;
+    else
+        yyCountingIndentation = false; // first non-blank ends the indentation
+    return ch;
+}
+
+// Reads one character ahead and stores it in the pushback slot so that the
+// next getCharFromFile() call returns it. Returns that character (or EOF).
+static int peekCharFromFile()
+{
+    buf = getc(yyInFile);
+    return buf;
+}
+
+/*
+  Prepares the tokenizer for a new input: installs the character source,
+  resets all per-file state and reads the first lookahead character.
+
+  fileName is remembered for warnings and messages; getCharFunc and
+  peekCharFunc become the getChar / peekChar callbacks.
+*/
+static void startTokenizer(const QString &fileName, int (*getCharFunc)(),
+                           int (*peekCharFunc)())
+{
+    yyInPos = 0;
+    buf = -1;
+    getChar = getCharFunc;
+    peekChar = peekCharFunc;
+
+    yyFileName = fileName;
+    yyParenDepth = 0;
+    yyCurLineNo = 1;
+
+    yyIndentationSize = -1;
+    yyContinuousSpaceCount = 0;
+    yyCountingIndentation = false;
+    yyContextStack.clear();
+
+    // Do not let a "#:" / "#=" comment left over from a previous file attach
+    // itself to the first message of this one.
+    extraComment.clear();
+    id.clear();
+
+    // Read the first character only after the counters are reset: the reader
+    // updates the line number and indentation state, and resetting them
+    // afterwards would lose that update when the file starts with a newline.
+    yyCh = getChar();
+}
+
+/*
+  Handles one backslash escape inside a string literal. On entry yyCh is
+  the backslash; on success the decoded character is appended to yyString
+  (silently dropped once the buffer is full) and yyCh is the character
+  following the escape.
+
+  quoteChar is the quote character delimiting the literal; stringType
+  selects raw-string handling.
+
+  Returns false if the input ends inside the escape, true otherwise.
+*/
+static bool parseStringEscape(int quoteChar, StringType stringType)
+{
+    static const char tab[] = "abfnrtv";
+    static const char backTab[] = "\a\b\f\n\r\t\v";
+
+    // Every write to yyString goes through this bounds check; one byte is
+    // kept free for the terminating NUL written by parseString().
+    const auto appendChar = [](char c) {
+        if (yyStringLen < sizeof(yyString) - 1)
+            yyString[yyStringLen++] = c;
+    };
+
+    yyCh = getChar();
+    if (yyCh == EOF)
+        return false;
+
+    if (stringType == StringType::RawString) {
+        if (yyCh != quoteChar) // Only quotes can be escaped in raw strings
+            appendChar('\\');
+        appendChar(char(yyCh));
+        yyCh = getChar();
+        return true;
+    }
+
+    // Backslash-newline is a line continuation in Python: both characters
+    // are dropped from the string value.
+    if (yyCh == '\n') {
+        yyCh = getChar();
+        return true;
+    }
+
+    if (yyCh == 'x') {
+        // Python's \xhh takes exactly two hex digits; any further hex digit
+        // is ordinary string content.
+        unsigned value = 0;
+        yyCh = getChar();
+        if (yyCh == EOF)
+            return false;
+        for (int digits = 0; digits < 2 && std::isxdigit(yyCh); ++digits) {
+            const int digit = std::isdigit(yyCh) ? yyCh - '0'
+                                                 : std::tolower(yyCh) - 'a' + 10;
+            value = value * 16 + unsigned(digit);
+            yyCh = getChar();
+            if (yyCh == EOF)
+                return false;
+        }
+        appendChar(char(value));
+        return true;
+    }
+
+    if (yyCh >= '0' && yyCh < '8') {
+        // Octal escape: one to three octal digits.
+        unsigned value = 0;
+        int digits = 0;
+        do {
+            value = value * 8 + unsigned(yyCh - '0');
+            ++digits;
+            yyCh = getChar();
+            if (yyCh == EOF)
+                return false;
+        } while (yyCh >= '0' && yyCh < '8' && digits < 3);
+        appendChar(char(value));
+        return true;
+    }
+
+    // Single-letter escapes are translated through tab/backTab; any other
+    // character stands for itself.
+    const char *p = std::strchr(tab, yyCh);
+    appendChar(p == nullptr ? char(yyCh) : backTab[p - tab]);
+    yyCh = getChar();
+    return true;
+}
+
+/*
+  Scans a string literal. On entry yyCh is the opening quote. Both
+  single-quoted ('...' / "...") and triple-quoted literals are handled; the
+  decoded contents are appended to yyString, which is NUL-terminated before
+  returning. Contents beyond the capacity of yyString are read but dropped.
+
+  stringType is forwarded to parseStringEscape().
+
+  Returns Tok_String with yyCh positioned after the closing quote, or
+  Tok_Eof when the input ends inside the literal. A warning is printed if
+  the literal is not terminated by its quote character.
+*/
+static Token parseString(StringType stringType = StringType::NoString)
+{
+    // Bounded append; one byte is kept free for the terminating NUL.
+    const auto appendChar = [](char c) {
+        if (yyStringLen < sizeof(yyString) - 1)
+            yyString[yyStringLen++] = c;
+    };
+
+    int quoteChar = yyCh;
+    bool tripleQuote = false;
+    bool singleQuote = true;
+    bool in = false; // true once we are past the opening delimiter
+
+    yyCh = getChar();
+
+    while (yyCh != EOF) {
+        // A single-quoted literal ends at its quote or at the end of line.
+        if (singleQuote && (yyCh == '\n' || (in && yyCh == quoteChar)))
+            break;
+
+        if (yyCh == quoteChar) {
+            if (peekChar() == quoteChar) {
+                yyCh = getChar();
+                if (!tripleQuote) {
+                    // Second and third quote of an opening triple quote.
+                    tripleQuote = true;
+                    singleQuote = false;
+                    in = true;
+                    yyCh = getChar();
+                } else {
+                    yyCh = getChar();
+                    if (yyCh == quoteChar) {
+                        // Closing triple quote.
+                        tripleQuote = false;
+                        break;
+                    }
+                    // Only two quotes in a row: they are part of the text.
+                    appendChar(char(quoteChar));
+                    appendChar(char(quoteChar));
+                }
+            } else if (tripleQuote) {
+                // A lone quote inside a triple-quoted literal is content.
+                appendChar(char(yyCh));
+                yyCh = getChar();
+                continue;
+            } else {
+                break;
+            }
+        } else {
+            in = true;
+        }
+
+        if (yyCh == '\\') {
+            if (!parseStringEscape(quoteChar, stringType))
+                return Tok_Eof;
+        } else {
+            // Copy a run of ordinary characters. Keep consuming input even
+            // when the buffer is full so that the tokenizer stays in sync.
+            while (yyCh != EOF && (tripleQuote || yyCh != '\n') && yyCh != quoteChar
+                   && yyCh != '\\') {
+                appendChar(char(yyCh));
+                yyCh = getChar();
+            }
+        }
+    }
+    yyString[yyStringLen] = '\0';
+
+    if (yyCh != quoteChar) {
+        qWarning("%s:%d: Unterminated string",
+                 qPrintable(yyFileName), yyLineNo);
+    }
+
+    if (yyCh == EOF)
+        return Tok_Eof;
+    yyCh = getChar();
+    return Tok_String;
+}
+
+// Collects the characters up to (not including) the next newline or EOF.
+// The character currently in yyCh is not part of the result; on return yyCh
+// holds the newline or EOF that ended the line.
+static QByteArray readLine()
+{
+    QByteArray line;
+    for (yyCh = getChar(); yyCh != EOF && yyCh != '\n'; yyCh = getChar())
+        line.append(char(yyCh));
+    return line;
+}
+
+/*
+  Scans and returns the next token of interest, skipping everything else.
+
+  Identifier-like words are stored in yyIdent and mapped through `tokens`
+  (Tok_Ident if not listed). "#:" and "#=" comments store the rest of the
+  line in extraComment / id; all other comments are skipped. String
+  literals are scanned by parseString(), to which stringType is forwarded.
+
+  Returns Tok_Eof at the end of the input.
+*/
+static Token getToken(StringType stringType = StringType::NoString)
+{
+    yyIdent.clear();
+    yyCommentLen = 0;
+    yyStringLen = 0;
+    while (yyCh != EOF) {
+        yyLineNo = yyCurLineNo;
+
+        if (std::isalpha(yyCh) || yyCh == '_') {
+            do {
+                yyIdent.append(char(yyCh));
+                yyCh = getChar();
+            } while (std::isalnum(yyCh) || yyCh == '_');
+
+            return tokens.value(yyIdent, Tok_Ident);
+        }
+        switch (yyCh) {
+        case '#':
+            // Keep the character after '#' in yyCh. Otherwise an empty
+            // comment ("#" directly followed by a newline) would leave '#'
+            // as the lookahead and the following line would be skipped as
+            // if it were the comment text.
+            yyCh = getChar();
+            switch (yyCh) {
+            case ':':
+                extraComment = readLine().trimmed();
+                break;
+            case '=':
+                id = readLine().trimmed();
+                break;
+            case EOF:
+                return Tok_Eof;
+            case '\n':
+                break;
+            default:
+                // Ordinary comment: skip to the end of the line.
+                do {
+                    yyCh = getChar();
+                } while (yyCh != EOF && yyCh != '\n');
+                break;
+            }
+            break;
+        case '"':
+        case '\'':
+            return parseString(stringType);
+        case '(':
+            yyParenDepth++;
+            yyCh = getChar();
+            return Tok_LeftParen;
+        case ')':
+            yyParenDepth--;
+            yyCh = getChar();
+            return Tok_RightParen;
+        case ',':
+            yyCh = getChar();
+            return Tok_Comma;
+        case '.':
+            yyCh = getChar();
+            return Tok_Dot;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':  {
+            QByteArray digits;
+            const bool leadingZero = (yyCh == '0');
+            digits += char(yyCh);
+            yyCh = getChar();
+            // A hexadecimal literal starts with "0x" or "0X". Only its
+            // digits are collected so that they can be converted in base 16.
+            const bool hex = leadingZero && (yyCh == 'x' || yyCh == 'X');
+            if (hex) {
+                digits.clear();
+                yyCh = getChar();
+            }
+            while ((hex ? std::isxdigit(yyCh) : std::isdigit(yyCh))) {
+                digits += char(yyCh);
+                yyCh = getChar();
+            }
+            // The value itself is not needed; the conversion only checks
+            // that the literal is well-formed and in range.
+            bool ok = false;
+            auto v = digits.toLongLong(&ok, hex ? 16 : 10);
+            Q_UNUSED(v);
+            if (ok)
+                return Tok_Integer;
+            break;
+        }
+        default:
+            yyCh = getChar();
+        }
+    }
+    return Tok_Eof;
+}
+
+/*
+  The second part of this source file is the parser. It accomplishes
+  a very easy task: It finds all strings inside a tr() or translate()
+  call, and possibly finds out the context of the call. It supports
+  three cases:
+  (1) the context is specified, as in FunnyDialog.tr("Hello") or
+     translate("FunnyDialog", "Hello");
+  (2) the call is made through "self", as in self.tr("Hello"); the context
+     is then the innermost class found by tracking indentation;
+  (3) the call has no prefix, as in tr("Hello"); the default context is used.
+*/
+
+// The parser's current (lookahead) token, as last returned by getToken().
+static Token yyTok;
+
+// Consumes the current token if it is of kind t. Returns whether it was.
+static bool match(Token t)
+{
+    if (yyTok != t)
+        return false;
+    yyTok = getToken();
+    return true;
+}
+
+// Returns true if the current token is a string literal, also accepting the
+// one-letter prefixes of raw (r"bla") and format (f"bla{var}") strings. In
+// the prefixed case the prefix identifier is consumed and the following
+// token is scanned with the matching string type; the result tells whether
+// that token is a string.
+static bool matchStringStart()
+{
+    if (yyTok == Tok_String)
+        return true;
+    if (yyTok != Tok_Ident || yyIdent.size() != 1)
+        return false;
+
+    const char prefixLetter = yyIdent.at(0);
+    StringType prefixedType = StringType::NoString;
+    if (prefixLetter == 'r')
+        prefixedType = StringType::RawString;
+    else if (prefixLetter == 'f')
+        prefixedType = StringType::FormatString;
+    else
+        return false;
+
+    yyTok = getToken(prefixedType);
+    return yyTok == Tok_String;
+}
+
+// Matches one or more adjacent string literals and stores their
+// concatenation in *s (which is cleared first). Returns true if at least
+// one literal was consumed.
+static bool matchString(QByteArray *s)
+{
+    s->clear();
+    int pieces = 0;
+    while (matchStringStart()) {
+        s->append(yyString);
+        yyTok = getToken();
+        ++pieces;
+    }
+    return pieces > 0;
+}
+
+/*
+  Matches an encoding argument such as
+  PySide6.QtCore.QCoreApplication.UnicodeUTF8, where every qualifier is
+  optional. *utf8 is set to whether the final identifier ends in "UTF8".
+
+  Note that any identifier at this position is accepted as the encoding.
+  Returns false if the current token is not an identifier or if a module
+  qualifier is not followed by a dot.
+*/
+static bool matchEncoding(bool *utf8)
+{
+    const auto identIs = [](const char *name) {
+        return std::strcmp(yyIdent, name) == 0;
+    };
+
+    // Remove any leading module paths.
+    if (yyTok == Tok_Ident && identIs("PySide6")) {
+        yyTok = getToken();
+        if (yyTok != Tok_Dot)
+            return false;
+        yyTok = getToken();
+    }
+
+    if (yyTok == Tok_Ident && (identIs("QtGui") || identIs("QtCore"))) {
+        yyTok = getToken();
+        if (yyTok != Tok_Dot)
+            return false;
+        yyTok = getToken();
+    }
+
+    if (yyTok != Tok_Ident)
+        return false;
+
+    // Optional application class in front of the encoding name.
+    const bool isApplicationClass = identIs("QApplication")
+                                    || identIs("QGuiApplication")
+                                    || identIs("QCoreApplication");
+    if (isApplicationClass) {
+        yyTok = getToken();
+        if (yyTok == Tok_Dot)
+            yyTok = getToken();
+    }
+
+    *utf8 = yyIdent.endsWith("UTF8");
+    yyTok = getToken();
+    return true;
+}
+
+// Matches either a (possibly concatenated) string literal, stored in *s, or
+// the keyword None, in which case *s is left empty.
+static bool matchStringOrNone(QByteArray *s)
+{
+    return matchString(s) || match(Tok_None);
+}
+
+/*
+ * match any expression that can return a number, which can be
+ * 1. Literal number (e.g. '11')
+ * 2. simple identifier (e.g. 'm_count')
+ * 3. simple function call (e.g. 'size()')
+ * 4. function call on an object (e.g. 'list.size()')
+ *
+ * Other cases:
+ * size(2,4)
+ * list().size()
+ * list(a,b).size(2,4)
+ * etc...
+ *
+ * The expression is consumed up to, but not including, the right
+ * parenthesis that closes the enclosing call. Tokens inside nested
+ * parentheses are skipped without being interpreted. Returns false if an
+ * unexpected token is found outside of parentheses or if the input ends.
+ */
+static bool matchExpression()
+{
+    if (match(Tok_Integer))
+        return true;
+
+    int parenlevel = 0;
+    while (match(Tok_Ident) || parenlevel > 0) {
+        if (yyTok == Tok_RightParen) {
+            if (parenlevel == 0)
+                break; // belongs to the enclosing call; leave it for the caller
+            --parenlevel;
+            yyTok = getToken();
+        } else if (yyTok == Tok_LeftParen) {
+            yyTok = getToken();
+            if (yyTok == Tok_RightParen) {
+                yyTok = getToken(); // empty argument list
+            } else {
+                ++parenlevel;
+            }
+        } else if (yyTok == Tok_Ident) {
+            continue; // consumed by the loop condition
+        } else if (yyTok == Tok_Eof) {
+            return false; // input ended inside the expression
+        } else if (parenlevel == 0 && yyTok != Tok_Dot) {
+            return false;
+        } else {
+            // Either a member-access dot or some argument token inside
+            // parentheses: skip it. Not consuming it here would make the
+            // loop spin forever on the same token.
+            yyTok = getToken();
+        }
+    }
+    return true;
+}
+
+/*
+  Parses the argument list of a translate() call:
+
+      translate(context, text [, comment-or-None [, encoding] [, plural]])
+
+  Python allows a trailing comma before the closing parenthesis, so one is
+  accepted after every argument.
+
+  On success the out-parameters hold the context, source text, comment,
+  whether a UTF-8 encoding was named and whether a plural expression was
+  present. All of them are reset on entry. Returns false on any deviation
+  from the syntax above.
+*/
+static bool parseTranslate(QByteArray *text, QByteArray *context, QByteArray *comment,
+                           bool *utf8, bool *plural)
+{
+    // What follows a complete argument.
+    enum class ArgEnd {
+        CallClosed,   // ")" or ",)" : the call is complete
+        Malformed,    // neither a comma nor a right paren: illegal syntax
+        MoreArguments // "," followed by another argument
+    };
+    const auto finishArgument = []() {
+        if (match(Tok_RightParen))
+            return ArgEnd::CallClosed;
+        if (!match(Tok_Comma))
+            return ArgEnd::Malformed;
+        return match(Tok_RightParen) ? ArgEnd::CallClosed : ArgEnd::MoreArguments;
+    };
+
+    text->clear();
+    context->clear();
+    comment->clear();
+    *utf8 = false;
+    *plural = false;
+
+    // Mandatory part: "(" context "," text
+    yyTok = getToken();
+    if (!match(Tok_LeftParen))
+        return false;
+    if (!matchString(context))
+        return false;
+    if (!match(Tok_Comma))
+        return false;
+    if (!matchString(text))
+        return false;
+
+    ArgEnd state = finishArgument();
+    if (state != ArgEnd::MoreArguments)
+        return state == ArgEnd::CallClosed;
+
+    // check for comment
+    if (!matchStringOrNone(comment))
+        return false; // not a comment, or a trailing comma... something is wrong
+
+    state = finishArgument();
+    if (state != ArgEnd::MoreArguments)
+        return state == ArgEnd::CallClosed;
+
+    // look for optional encoding information
+    if (matchEncoding(utf8)) {
+        state = finishArgument();
+        if (state != ArgEnd::MoreArguments)
+            return state == ArgEnd::CallClosed;
+    }
+
+    // Must be a plural expression
+    if (!matchExpression())
+        return false;
+
+    *plural = true;
+
+    // Ignore any trailing comma here
+    match(Tok_Comma);
+
+    // This must be the end, or there are too many parameters
+    return match(Tok_RightParen);
+}
+
+// Transfers a pending "#:" extra comment and a pending "#=" id to the
+// message. Each is cleared once used, so it applies to one message only.
+static inline void setMessageParameters(TranslatorMessage *message)
+{
+    if (extraComment.size() > 0) {
+        const QString pendingComment = QString::fromUtf8(extraComment);
+        extraComment.clear();
+        message->setExtraComment(pendingComment);
+    }
+    if (id.size() > 0) {
+        const QString pendingId = QString::fromUtf8(id);
+        id.clear();
+        message->setId(pendingId);
+    }
+}
+
+/*
+  Runs the parser over the token stream of the current file and adds every
+  message found to tor.
+
+  "class" and "def" are tracked together with their indentation in order to
+  know the innermost enclosing class. For a tr() call the context is
+  - defaultContext if the call has no prefix,
+  - the innermost class (or initialContext outside of any class) if the
+    prefix is "self",
+  - the prefix itself otherwise (e.g. FunnyDialog.tr("Hello")).
+  For translate() the context is taken from the first argument.
+
+  A warning is printed if the parentheses of the file are unbalanced.
+*/
+static void parse(Translator &tor, ConversionData &cd,
+                  const QByteArray &initialContext = {},
+                  const QByteArray &defaultContext = {})
+{
+    QByteArray context;
+    QByteArray text;
+    QByteArray comment;
+    QByteArray prefix; // dotted identifier chain in front of the current token
+    bool utf8 = false;
+
+    yyTok = getToken();
+    while (yyTok != Tok_Eof) {
+
+        switch (yyTok) {
+            case Tok_class: {
+                if (yyIndentationSize < 0 && yyContinuousSpaceCount > 0)
+                    yyIndentationSize = yyContinuousSpaceCount; // First indented "class"
+                const int indent = yyIndentationSize > 0
+                                   ? yyContinuousSpaceCount / yyIndentationSize : 0;
+                // Leave all classes that are not less indented than this one.
+                while (!yyContextStack.isEmpty() && yyContextStack.top().second >= indent)
+                    yyContextStack.pop();
+                yyTok = getToken(); // the class name, left in yyIdent
+                yyContextStack.push({yyIdent, indent});
+                yyTok = getToken();
+            }
+                break;
+            case Tok_def:
+                if (yyIndentationSize < 0 && yyContinuousSpaceCount > 0)
+                    yyIndentationSize = yyContinuousSpaceCount; // First indented "def"
+                if (!yyContextStack.isEmpty()) {
+                    // Pop classes if the function is further outdented than the class on the top
+                    // (end of a nested class).
+                    const int classIndent = yyIndentationSize > 0
+                                            ? yyContinuousSpaceCount / yyIndentationSize - 1 : 0;
+                    while (!yyContextStack.isEmpty() && yyContextStack.top().second > classIndent)
+                        yyContextStack.pop();
+                }
+                yyTok = getToken();
+                break;
+            case Tok_tr:
+            case Tok_trUtf8:
+                utf8 = true;
+                yyTok = getToken();
+                if (match(Tok_LeftParen) && matchString(&text)) {
+                    comment.clear();
+                    bool plural = false;
+
+                    if (match(Tok_RightParen)) {
+                        // There is no comment or plural arguments.
+                    } else if (match(Tok_Comma) && matchStringOrNone(&comment)) {
+                        // There is a comment argument.
+                        if (match(Tok_RightParen)) {
+                            // There is no plural argument.
+                        } else if (match(Tok_Comma)) {
+                            // There is a plural argument.
+                            plural = true;
+                        }
+                    }
+
+                    if (prefix.isEmpty())
+                        context = defaultContext;
+                    else if (prefix == "self")
+                        context = yyContextStack.isEmpty()
+                                  ? initialContext : yyContextStack.top().first;
+                    else
+                        context = prefix;
+
+                    if (!text.isEmpty()) {
+                        TranslatorMessage message(QString::fromUtf8(context),
+                                                  QString::fromUtf8(text),
+                                                  QString::fromUtf8(comment),
+                                                  {}, yyFileName, yyLineNo,
+                                                  {}, TranslatorMessage::Unfinished, plural);
+                        setMessageParameters(&message);
+                        tor.extend(message, cd);
+                    }
+                }
+                // The qualifier belonged to this tr reference, whether or not
+                // it turned out to be a call. Dropping it unconditionally
+                // keeps it from being prepended to the next identifier.
+                prefix.clear();
+                break;
+            case Tok_translate: {
+                bool plural{};
+                if (parseTranslate(&text, &context, &comment, &utf8, &plural)
+                    && !text.isEmpty()) {
+                        TranslatorMessage message(QString::fromUtf8(context),
+                                                  QString::fromUtf8(text),
+                                                  QString::fromUtf8(comment),
+                                                  {}, yyFileName, yyLineNo,
+                                                  {}, TranslatorMessage::Unfinished, plural);
+                        setMessageParameters(&message);
+                        tor.extend(message, cd);
+                    }
+                // A qualifier such as "QCoreApplication." in front of
+                // translate must not leak into the next identifier chain.
+                prefix.clear();
+                }
+                break;
+            case Tok_Ident:
+                // Build up the dotted prefix; it is only kept while the
+                // identifier is followed by a dot.
+                if (!prefix.isEmpty())
+                    prefix += '.';
+                prefix += yyIdent;
+                yyTok = getToken();
+                if (yyTok != Tok_Dot)
+                    prefix.clear();
+                break;
+            case Tok_Comment:
+                // Magic comment "TRANSLATOR <context> [<comment>]".
+                comment = yyComment;
+                comment = comment.simplified();
+                if (comment.left(sizeof(PythonMagicComment) - 1) == PythonMagicComment) {
+                    comment.remove(0, sizeof(PythonMagicComment) - 1);
+                    int k = comment.indexOf(' ');
+                    if (k == -1) {
+                        context = comment;
+                    } else {
+                        context = comment.left(k);
+                        comment.remove( 0, k + 1);
+                        TranslatorMessage message(QString::fromUtf8(context),
+                                                  {}, QString::fromUtf8(comment), {},
+                                                  yyFileName, yyLineNo, {});
+                        tor.extend(message, cd);
+                    }
+                }
+                yyTok = getToken();
+                break;
+            default:
+                yyTok = getToken();
+        }
+    }
+
+    if (yyParenDepth != 0) {
+        qWarning("%s: Unbalanced parentheses in Python code",
+                 qPrintable(yyFileName));
+    }
+}
+
+/*
+  Extracts the translatable messages of the Python file fileName and adds
+  them to translator.
+
+  On the first call, the function names known to trFunctionAliasManager are
+  registered as tokens so that aliases are recognized by the tokenizer.
+
+  Returns false, after appending an error to cd, if the file cannot be
+  opened; true otherwise.
+*/
+bool loadPython(Translator &translator, const QString &fileName, ConversionData &cd)
+{
+    // Match the function aliases to our tokens
+    static bool aliasesPending = true;
+    if (aliasesPending) {
+        aliasesPending = false;
+        const auto &aliasMap = trFunctionAliasManager.nameToTrFunctionMap();
+        for (auto it = aliasMap.cbegin(), end = aliasMap.cend(); it != end; ++it) {
+            const QByteArray alias = it.key().toUtf8();
+            switch (it.value()) {
+            case TrFunctionAliasManager::Function_tr:
+            case TrFunctionAliasManager::Function_QT_TR_NOOP:
+                tokens.insert(alias, Tok_tr);
+                break;
+            case TrFunctionAliasManager::Function_trUtf8:
+                tokens.insert(alias, Tok_trUtf8);
+                break;
+            case TrFunctionAliasManager::Function_translate:
+            case TrFunctionAliasManager::Function_QT_TRANSLATE_NOOP:
+            // QTranslator::findMessage() has the same parameters as QApplication::translate().
+            case TrFunctionAliasManager::Function_findMessage:
+                tokens.insert(alias, Tok_translate);
+                break;
+            default:
+                break;
+            }
+        }
+    }
+
+    // Open the file with the C runtime; the tokenizer reads it via getc().
+#ifdef Q_CC_MSVC
+    const auto *nativeName = reinterpret_cast<const wchar_t *>(fileName.utf16());
+    const bool opened = _wfopen_s(&yyInFile, nativeName, L"r") == 0;
+#else
+    const QByteArray nativeName = QFile::encodeName(fileName);
+    yyInFile = std::fopen( nativeName.constData(), "r");
+    const bool opened = yyInFile != nullptr;
+#endif
+    if (!opened) {
+        cd.appendError(QStringLiteral("Cannot open %1").arg(fileName));
+        return false;
+    }
+
+    startTokenizer(fileName, getCharFromFile, peekCharFromFile);
+    parse(translator, cd);
+    std::fclose(yyInFile);
+    return true;
+}
+
+QT_END_NAMESPACE