summaryrefslogtreecommitdiffstats
path: root/src/linguist/lupdate/python.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/linguist/lupdate/python.cpp')
-rw-r--r--src/linguist/lupdate/python.cpp777
1 files changed, 777 insertions, 0 deletions
diff --git a/src/linguist/lupdate/python.cpp b/src/linguist/lupdate/python.cpp
new file mode 100644
index 000000000..0bc3bf5e8
--- /dev/null
+++ b/src/linguist/lupdate/python.cpp
@@ -0,0 +1,777 @@
+// Copyright (C) 2002-2007 Detlev Offenbach <detlev@die-offenbachs.de>
+// Copyright (C) 2021 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
+
+#include <translator.h>
+#include "lupdate.h"
+
+#include <QtCore/qhash.h>
+#include <QtCore/qstring.h>
+#include <QtCore/qtextstream.h>
+#include <QtCore/qstack.h>
+
+#include <cctype>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+
+QT_BEGIN_NAMESPACE
+
+static const char PythonMagicComment[] = "TRANSLATOR ";
+
+/*
+ The first part of this source file is the Python tokenizer. We skip
+ most of Python; the only tokens that interest us are defined here.
+*/
+
+// Token kinds used by the tokenizer and parser below. Tok_class, Tok_def,
+// Tok_return and Tok_None are mapped from Python keywords by the "tokens"
+// table; Tok_tr, Tok_trUtf8 and Tok_translate mark translation functions.
+// Everything the tokenizer has no case for is skipped without a token.
+enum Token { Tok_Eof, Tok_class, Tok_def, Tok_return, Tok_tr,
+ Tok_trUtf8, Tok_translate, Tok_Ident,
+ Tok_Comment, Tok_Dot, Tok_String,
+ Tok_LeftParen, Tok_RightParen,
+ Tok_Comma, Tok_None, Tok_Integer};
+
+// Kind of string literal being scanned, selected from the one-letter prefix
+// in front of the opening quote (see matchStringStart()).
+enum class StringType
+{
+ NoString, // default: no prefix was seen
+ String,
+ FormatString, // f"..."
+ RawString // r"..."; changes how backslashes are handled
+};
+
+/*
+ The tokenizer maintains the following global variables. The names
+ should be self-explanatory.
+*/
+static QString yyFileName; // name of the file being scanned, used in warnings and messages
+static int yyCh; // most recently read character, or EOF
+static QByteArray yyIdent; // text of the identifier collected by getToken()
+// NOTE(review): yyComment is only read (Tok_Comment case of parse()); nothing
+// in this file appears to write to it - confirm whether it is still needed.
+static char yyComment[65536];
+static size_t yyCommentLen; // reset to 0 at the start of getToken()
+static char yyString[65536]; // NUL-terminated contents of the last string literal
+static size_t yyStringLen; // number of bytes currently stored in yyString
+static int yyParenDepth; // '(' seen minus ')' seen by getToken()
+static int yyLineNo; // value of yyCurLineNo when the current token scan started
+static int yyCurLineNo; // incremented for every '\n' read by getCharFromFile()
+
+static QByteArray extraComment; // text of the last "#:" comment, consumed by setMessageParameters()
+static QByteArray id; // text of the last "#=" comment, consumed by setMessageParameters()
+
+// Maps identifier spellings to tokens. loadPython() adds the configured
+// translation function aliases to this table on its first call.
+QHash<QByteArray, Token> tokens = {
+ {"None", Tok_None},
+ {"class", Tok_class},
+ {"def", Tok_def},
+ {"return", Tok_return},
+ {"__tr", Tok_tr}, // Legacy?
+ {"__trUtf8", Tok_trUtf8}
+};
+
+// the file to read from (if reading from a file)
+static FILE *yyInFile;
+
+// the string to read from and current position in the string (otherwise)
+static int yyInPos;
+// One-character look-ahead filled by peekCharFromFile(); -1 when empty.
+static int buf;
+
+// Character source selected by startTokenizer().
+static int (*getChar)();
+static int (*peekChar)();
+
+// Width of one indentation step; -1 until parse() sees the first indented
+// "class" or "def".
+static int yyIndentationSize;
+// Number of blanks/tabs read so far at the start of the current line.
+static int yyContinuousSpaceCount;
+// True while only blanks/tabs have been read since the last newline.
+static bool yyCountingIndentation;
+
+// (Context, indentation level) pair.
+using ContextPair = QPair<QByteArray, int>;
+// Stack of (Context, indentation level) pairs.
+using ContextStack = QStack<ContextPair>;
+static ContextStack yyContextStack;
+
+/*
+  Returns the next character of the input file, or EOF. A character
+  parked by peekCharFromFile() is handed out before the file is read
+  again. As a side effect the current line number and the count of
+  leading blanks/tabs of the current line are kept up to date.
+*/
+static int getCharFromFile()
+{
+    int ch = buf;
+    if (ch >= 0)
+        buf = -1; // the look-ahead character is consumed now
+    else
+        ch = getc(yyInFile);
+
+    switch (ch) {
+    case '\n':
+        // A new line starts: restart counting its indentation.
+        ++yyCurLineNo;
+        yyCountingIndentation = true;
+        yyContinuousSpaceCount = 0;
+        break;
+    case ' ':
+    case '\t':
+        // Blanks and tabs each count as one column, but only while
+        // nothing else has been read on this line.
+        if (yyCountingIndentation)
+            ++yyContinuousSpaceCount;
+        break;
+    default:
+        yyCountingIndentation = false;
+        break;
+    }
+    return ch;
+}
+
+/*
+  Returns the character the next getCharFromFile() call will deliver,
+  without consuming it. The character is parked in "buf".
+
+  Peeking repeatedly returns the same character: a pending look-ahead
+  is reused instead of being overwritten by a fresh read, which would
+  silently drop a character from the input.
+*/
+static int peekCharFromFile()
+{
+    if (buf < 0)
+        buf = getc(yyInFile); // EOF is negative, so it is simply re-read later
+    return buf;
+}
+
+/*
+  Resets the tokenizer state for a new input and reads the first
+  character into yyCh.
+
+  fileName is stored for diagnostics and messages; getCharFunc and
+  peekCharFunc become the character source used by the tokenizer.
+
+  All counters are initialized before the first character is read.
+  The character source updates yyCurLineNo and the indentation
+  counters as a side effect, so reading first and resetting afterwards
+  would discard the effect of a newline at the very start of the
+  input and make every reported line number one too small.
+*/
+static void startTokenizer(const QString &fileName, int (*getCharFunc)(),
+                           int (*peekCharFunc)())
+{
+    yyInPos = 0;
+    buf = -1;
+    getChar = getCharFunc;
+    peekChar = peekCharFunc;
+
+    yyFileName = fileName;
+    yyParenDepth = 0;
+    yyCurLineNo = 1;
+
+    yyIndentationSize = -1;
+    yyContinuousSpaceCount = 0;
+    yyCountingIndentation = false;
+    yyContextStack.clear();
+
+    // Prime the tokenizer only once the state above is consistent.
+    yyCh = getChar();
+}
+
+/*
+  Handles a backslash inside a string literal. On entry yyCh is the
+  backslash; on successful return yyCh is the first character after
+  the escape sequence and the decoded byte(s) have been appended to
+  yyString.
+
+  quoteChar is the quote character delimiting the literal, stringType
+  the kind of literal. In raw strings the backslash is kept unless it
+  precedes the quote character.
+
+  Returns false if the input ends inside the escape sequence.
+
+  Bytes that no longer fit into yyString are dropped; one byte is
+  always left free for the terminating NUL.
+*/
+static bool parseStringEscape(int quoteChar, StringType stringType)
+{
+    static const char tab[] = "abfnrtv";
+    static const char backTab[] = "\a\b\f\n\r\t\v";
+
+    // Bounds-checked append shared by every branch below.
+    const auto append = [](char c) {
+        if (yyStringLen < sizeof(yyString) - 1)
+            yyString[yyStringLen++] = c;
+    };
+
+    yyCh = getChar();
+    if (yyCh == EOF)
+        return false;
+
+    if (stringType == StringType::RawString) {
+        if (yyCh != quoteChar) // Only quotes can be escaped in raw strings
+            append('\\');
+        append(char(yyCh));
+        yyCh = getChar();
+        return true;
+    }
+
+    if (yyCh == 'x') {
+        // Python's \xhh takes exactly two hex digits; anything after
+        // them is ordinary string content.
+        QByteArray hex;
+        yyCh = getChar();
+        if (yyCh == EOF)
+            return false;
+        while (hex.size() < 2 && std::isxdigit(yyCh)) {
+            hex += char(yyCh);
+            yyCh = getChar();
+            if (yyCh == EOF)
+                return false;
+        }
+        // An empty digit sequence converts to 0.
+        append(char(hex.toUInt(nullptr, 16)));
+        return true;
+    }
+
+    if (yyCh >= '0' && yyCh < '8') {
+        // Up to three octal digits.
+        QByteArray oct;
+        do {
+            oct += char(yyCh);
+            yyCh = getChar();
+            if (yyCh == EOF)
+                return false;
+        } while (yyCh >= '0' && yyCh < '8' && oct.size() < 3);
+        append(char(oct.toUInt(nullptr, 8)));
+        return true;
+    }
+
+    // Single-letter escapes; unknown ones yield the character itself.
+    const char *p = std::strchr(tab, yyCh);
+    append(p == nullptr ? char(yyCh) : backTab[p - tab]);
+    yyCh = getChar();
+    return true;
+}
+
+/*
+  Scans a string literal into yyString. On entry yyCh is the opening
+  quote character; both single-quoted and triple-quoted literals are
+  recognized. stringType is passed on to parseStringEscape() to select
+  the backslash handling.
+
+  Returns Tok_String with yyCh set to the character after the literal,
+  or Tok_Eof if the input ends first. An unterminated literal is
+  reported with a warning.
+
+  Content that does not fit into yyString is read but dropped, so an
+  oversized literal can never write past the end of the buffer.
+*/
+static Token parseString(StringType stringType = StringType::NoString)
+{
+    int quoteChar = yyCh;
+    bool tripleQuote = false;
+    bool singleQuote = true;
+    bool in = false;
+
+    yyCh = getChar();
+
+    while (yyCh != EOF) {
+        if (singleQuote && (yyCh == '\n' || (in && yyCh == quoteChar)))
+            break;
+
+        if (yyCh == quoteChar) {
+            if (peekChar() == quoteChar) {
+                yyCh = getChar();
+                if (!tripleQuote) {
+                    // Opening triple quote.
+                    tripleQuote = true;
+                    singleQuote = false;
+                    in = true;
+                    yyCh = getChar();
+                } else {
+                    yyCh = getChar();
+                    if (yyCh == quoteChar) {
+                        // Closing triple quote.
+                        tripleQuote = false;
+                        break;
+                    }
+                }
+            } else if (tripleQuote) {
+                // A lone quote inside a triple-quoted literal is content.
+                if (yyStringLen < sizeof(yyString) - 1)
+                    yyString[yyStringLen++] = char(yyCh);
+                yyCh = getChar();
+                continue;
+            } else {
+                break;
+            }
+        } else {
+            in = true;
+        }
+
+        if (yyCh == '\\') {
+            if (!parseStringEscape(quoteChar, stringType))
+                return Tok_Eof;
+        } else {
+            // Copy a run of ordinary characters, keeping one byte free
+            // for the terminating NUL.
+            while (yyCh != EOF && (tripleQuote || yyCh != '\n') && yyCh != quoteChar
+                   && yyCh != '\\') {
+                if (yyStringLen < sizeof(yyString) - 1)
+                    yyString[yyStringLen++] = char(yyCh);
+                yyCh = getChar();
+            }
+        }
+    }
+    yyString[yyStringLen] = '\0';
+
+    if (yyCh != quoteChar) {
+        qWarning("%s:%d: Unterminated string",
+                 qPrintable(yyFileName), yyLineNo);
+    }
+
+    if (yyCh == EOF)
+        return Tok_Eof;
+    yyCh = getChar();
+    return Tok_String;
+}
+
+/*
+  Reads characters up to the next newline or the end of the input and
+  returns them. The character held in yyCh on entry is not part of the
+  result; on return yyCh is the newline or EOF that ended the line.
+*/
+static QByteArray readLine()
+{
+    QByteArray line;
+    for (yyCh = getChar(); yyCh != EOF && yyCh != '\n'; yyCh = getChar())
+        line += char(yyCh);
+    return line;
+}
+
+/*
+  Returns the next token of interest, skipping everything else.
+
+  Identifiers are looked up in "tokens" and reported as Tok_Ident if
+  they are not listed there; their text is left in yyIdent. String
+  literals are scanned by parseString(), which receives stringType.
+  "#:" and "#=" comments are stored in extraComment and id; all other
+  comments are skipped. Returns Tok_Eof at the end of the input.
+*/
+static Token getToken(StringType stringType = StringType::NoString)
+{
+    yyIdent.clear();
+    yyCommentLen = 0;
+    yyStringLen = 0;
+    while (yyCh != EOF) {
+        yyLineNo = yyCurLineNo;
+
+        if (std::isalpha(yyCh) || yyCh == '_') {
+            do {
+                yyIdent.append(char(yyCh));
+                yyCh = getChar();
+            } while (std::isalnum(yyCh) || yyCh == '_');
+
+            return tokens.value(yyIdent, Tok_Ident);
+        }
+        switch (yyCh) {
+        case '#':
+            // Keep yyCh in sync with the input. Without this, an empty
+            // comment ("#" directly followed by a newline) would leave
+            // '#' in yyCh and the following line would be swallowed as
+            // comment text.
+            yyCh = getChar();
+            switch (yyCh) {
+            case ':':
+                extraComment = readLine().trimmed();
+                break;
+            case '=':
+                id = readLine().trimmed();
+                break;
+            case EOF:
+                return Tok_Eof;
+            case '\n':
+                break;
+            default:
+                do {
+                    yyCh = getChar();
+                } while (yyCh != EOF && yyCh != '\n');
+                break;
+            }
+            break;
+        case '"':
+        case '\'':
+            return parseString(stringType);
+        case '(':
+            yyParenDepth++;
+            yyCh = getChar();
+            return Tok_LeftParen;
+        case ')':
+            yyParenDepth--;
+            yyCh = getChar();
+            return Tok_RightParen;
+        case ',':
+            yyCh = getChar();
+            return Tok_Comma;
+        case '.':
+            yyCh = getChar();
+            return Tok_Dot;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9': {
+            QByteArray digits;
+            digits += char(yyCh);
+            yyCh = getChar();
+            // "0x"/"0X" introduces a hexadecimal literal. The prefix is
+            // dropped and the digits are converted with base 16; a
+            // base-10 conversion of "0x..." always fails.
+            const bool hex = digits == "0" && (yyCh == 'x' || yyCh == 'X');
+            if (hex) {
+                digits.clear();
+                yyCh = getChar();
+            }
+            while ((hex ? std::isxdigit(yyCh) : std::isdigit(yyCh))) {
+                digits += char(yyCh);
+                yyCh = getChar();
+            }
+            bool ok = false;
+            auto v = digits.toLongLong(&ok, hex ? 16 : 10);
+            Q_UNUSED(v);
+            if (ok)
+                return Tok_Integer;
+            break;
+        }
+        default:
+            yyCh = getChar();
+        }
+    }
+    return Tok_Eof;
+}
+
+/*
+ The second part of this source file is the parser. It accomplishes
+ a very easy task: It finds all strings inside a tr() or translate()
+ call, and possibly finds out the context of the call. It supports
+ three cases:
+ (1) the context is specified, as in FunnyDialog.tr("Hello") or
+ translate("FunnyDialog", "Hello");
+ (2) the call appears within an inlined function;
+ (3) the call appears within a function defined outside the class definition.
+*/
+
+static Token yyTok;
+
+/*
+  Consumes the current token if it is t and reports whether it did;
+  otherwise the current token is left untouched.
+*/
+static bool match(Token t)
+{
+    if (yyTok != t)
+        return false;
+    yyTok = getToken();
+    return true;
+}
+
+/*
+  Returns true if the current token is a string literal. A
+  one-letter identifier "r" or "f" is taken as a string prefix: the
+  next token is then fetched with the matching string type and must
+  be a string.
+*/
+static bool matchStringStart()
+{
+    if (yyTok == Tok_String)
+        return true;
+
+    // Only a single-character identifier can be a prefix, as in
+    // r"bla" or f"bla{var}".
+    if (yyTok != Tok_Ident || yyIdent.size() != 1)
+        return false;
+
+    StringType prefixed;
+    const char letter = yyIdent.at(0);
+    if (letter == 'r')
+        prefixed = StringType::RawString;
+    else if (letter == 'f')
+        prefixed = StringType::FormatString;
+    else
+        return false;
+
+    yyTok = getToken(prefixed);
+    return yyTok == Tok_String;
+}
+
+/*
+  Collects a sequence of adjacent string literals into *s, which is
+  cleared first. Returns true if at least one literal was found; the
+  current token is then the first one after the literals.
+*/
+static bool matchString(QByteArray *s)
+{
+    bool found = false;
+    s->clear();
+    for (; matchStringStart(); yyTok = getToken()) {
+        s->append(yyString);
+        found = true;
+    }
+    return found;
+}
+
+/*
+  Matches an encoding argument such as
+  PySide6.QtCore.QCoreApplication.<name>, where the package, module
+  and class parts are each optional. *utf8 is set to whether the
+  identifier in yyIdent at that point ends with "UTF8".
+
+  Returns false if the current token cannot start such an argument or
+  a package/module name is not followed by a dot; tokens consumed up
+  to that point stay consumed.
+*/
+static bool matchEncoding(bool *utf8)
+{
+    // Optional package name.
+    if (yyTok == Tok_Ident && yyIdent == "PySide6") {
+        yyTok = getToken();
+        if (yyTok != Tok_Dot)
+            return false;
+        yyTok = getToken();
+    }
+
+    // Optional module name.
+    const bool isModule = yyTok == Tok_Ident
+            && (yyIdent == "QtGui" || yyIdent == "QtCore");
+    if (isModule) {
+        yyTok = getToken();
+        if (yyTok != Tok_Dot)
+            return false;
+        yyTok = getToken();
+    }
+
+    if (yyTok != Tok_Ident)
+        return false;
+
+    // Optional application class name, followed by an optional dot.
+    const bool isApplicationClass = yyIdent == "QApplication"
+            || yyIdent == "QGuiApplication"
+            || yyIdent == "QCoreApplication";
+    if (isApplicationClass) {
+        yyTok = getToken();
+        if (yyTok == Tok_Dot)
+            yyTok = getToken();
+    }
+
+    *utf8 = yyIdent.endsWith("UTF8");
+    yyTok = getToken();
+    return true;
+}
+
+/*
+  Matches either a (possibly concatenated) string literal, stored in
+  *s, or the keyword None, in which case *s is left empty.
+*/
+static bool matchStringOrNone(QByteArray *s)
+{
+    return matchString(s) || match(Tok_None);
+}
+
+/*
+ * Match any expression that can return a number, which can be
+ * 1. Literal number (e.g. '11')
+ * 2. simple identifier (e.g. 'm_count')
+ * 3. simple function call (e.g. 'size()')
+ * 4. function call on an object (e.g. 'list.size()')
+ *
+ * Other cases:
+ * size(2,4)
+ * list().size()
+ * list(a,b).size(2,4)
+ * etc...
+ *
+ * Returns false only when, outside of any parentheses, an identifier
+ * is followed by a token that is neither an identifier nor a
+ * parenthesis. Note that true is also returned, without consuming
+ * anything, if the current token is neither an integer nor an
+ * identifier; callers must check what follows.
+ */
+static bool matchExpression()
+{
+ if (match(Tok_Integer))
+ return true;
+
+ // Number of currently open, non-empty parenthesis pairs.
+ int parenlevel = 0;
+ // match(Tok_Ident) consumes an identifier as a side effect; while inside
+ // parentheses the loop keeps going even if the current token is not one.
+ while (match(Tok_Ident) || parenlevel > 0) {
+ if (yyTok == Tok_RightParen) {
+ // A ')' at level 0 belongs to the enclosing call: leave it for the caller.
+ if (parenlevel == 0)
+ break;
+ --parenlevel;
+ yyTok = getToken();
+ } else if (yyTok == Tok_LeftParen) {
+ yyTok = getToken();
+ if (yyTok == Tok_RightParen) {
+ // Empty argument list "()": consumed without changing the level.
+ yyTok = getToken();
+ } else {
+ ++parenlevel;
+ }
+ } else if (yyTok == Tok_Ident) {
+ // Consumed by match(Tok_Ident) in the loop condition.
+ continue;
+ } else if (parenlevel == 0) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/*
+  Parses the argument list of a translate()-style call:
+
+      ( context , text [, comment-or-None [, encoding] [, plural]] )
+
+  A trailing comma is accepted before the closing parenthesis. On
+  entry the current token is the function name. All output
+  parameters are reset first; *plural is set once a plural expression
+  has been matched. Returns true if the call was parsed up to and
+  including its closing parenthesis.
+*/
+static bool parseTranslate(QByteArray *text, QByteArray *context, QByteArray *comment,
+                           bool *utf8, bool *plural)
+{
+    enum class Next { Done, More, Error };
+
+    // Decides what follows a complete argument: the end of the call
+    // (")" or ",)"), a further argument (","), or a syntax error.
+    // Python accepts trailing commas within parentheses.
+    const auto afterArgument = []() -> Next {
+        if (match(Tok_RightParen))
+            return Next::Done;
+        if (!match(Tok_Comma))
+            return Next::Error;
+        return match(Tok_RightParen) ? Next::Done : Next::More;
+    };
+
+    text->clear();
+    context->clear();
+    comment->clear();
+    *utf8 = false;
+    *plural = false;
+
+    // Mandatory part: "(" context "," text
+    yyTok = getToken();
+    if (!match(Tok_LeftParen))
+        return false;
+    if (!matchString(context))
+        return false;
+    if (!match(Tok_Comma))
+        return false;
+    if (!matchString(text))
+        return false;
+
+    Next next = afterArgument();
+    if (next != Next::More)
+        return next == Next::Done;
+
+    // Optional comment; anything else here is an error.
+    if (!matchStringOrNone(comment))
+        return false;
+
+    next = afterArgument();
+    if (next != Next::More)
+        return next == Next::Done;
+
+    // Optional encoding information.
+    if (matchEncoding(utf8)) {
+        next = afterArgument();
+        if (next != Next::More)
+            return next == Next::Done;
+    }
+
+    // Whatever is left must be a plural expression.
+    if (!matchExpression())
+        return false;
+    *plural = true;
+
+    // Ignore any trailing comma here.
+    match(Tok_Comma);
+
+    // This must be the end, or there are too many parameters.
+    return match(Tok_RightParen);
+}
+
+/*
+  Transfers a pending "#:" extra comment and a pending "#=" id to
+  message. Each value is cleared once used, so it applies to one
+  message only.
+*/
+static inline void setMessageParameters(TranslatorMessage *message)
+{
+    const bool hasExtraComment = !extraComment.isEmpty();
+    if (hasExtraComment) {
+        const QString pendingComment = QString::fromUtf8(extraComment);
+        message->setExtraComment(pendingComment);
+        extraComment.clear();
+    }
+
+    const bool hasId = !id.isEmpty();
+    if (hasId) {
+        const QString pendingId = QString::fromUtf8(id);
+        message->setId(pendingId);
+        id.clear();
+    }
+}
+
+/*
+ Walks the token stream of the file set up by startTokenizer() and adds a
+ message to tor for every translation call with a non-empty source text.
+
+ initialContext is used for self.tr() calls found outside of any class,
+ defaultContext for tr() calls without an object prefix. cd is passed on
+ to Translator::extend(). A warning is printed at the end if the
+ parentheses seen by the tokenizer were not balanced.
+*/
+static void parse(Translator &tor, ConversionData &cd,
+ const QByteArray &initialContext = {},
+ const QByteArray &defaultContext = {})
+{
+ QByteArray context;
+ QByteArray text;
+ QByteArray comment;
+ // Dotted identifier path collected in front of the current token, e.g. "self".
+ QByteArray prefix;
+ bool utf8 = false;
+
+ yyTok = getToken();
+ while (yyTok != Tok_Eof) {
+
+ switch (yyTok) {
+ case Tok_class: {
+ if (yyIndentationSize < 0 && yyContinuousSpaceCount > 0)
+ yyIndentationSize = yyContinuousSpaceCount; // First indented "class"
+ // Nesting level of this class, in indentation steps.
+ const int indent = yyIndentationSize > 0
+ ? yyContinuousSpaceCount / yyIndentationSize : 0;
+ // Classes at the same or a deeper level have ended.
+ while (!yyContextStack.isEmpty() && yyContextStack.top().second >= indent)
+ yyContextStack.pop();
+ // The token after "class" supplies the class name via yyIdent.
+ yyTok = getToken();
+ yyContextStack.push({yyIdent, indent});
+ yyTok = getToken();
+ }
+ break;
+ case Tok_def:
+ if (yyIndentationSize < 0 && yyContinuousSpaceCount > 0)
+ yyIndentationSize = yyContinuousSpaceCount; // First indented "def"
+ if (!yyContextStack.isEmpty()) {
+ // Pop classes if the function is further outdented than the class on the top
+ // (end of a nested class).
+ const int classIndent = yyIndentationSize > 0
+ ? yyContinuousSpaceCount / yyIndentationSize - 1 : 0;
+ while (!yyContextStack.isEmpty() && yyContextStack.top().second > classIndent)
+ yyContextStack.pop();
+ }
+ yyTok = getToken();
+ break;
+ case Tok_tr:
+ case Tok_trUtf8:
+ // Both token kinds are handled identically here; utf8 is not read
+ // again in this branch.
+ utf8 = true;
+ yyTok = getToken();
+ if (match(Tok_LeftParen) && matchString(&text)) {
+ comment.clear();
+ bool plural = false;
+
+ if (match(Tok_RightParen)) {
+ // There is no comment or plural arguments.
+ } else if (match(Tok_Comma) && matchStringOrNone(&comment)) {
+ // There is a comment argument.
+ if (match(Tok_RightParen)) {
+ // There is no plural argument.
+ } else if (match(Tok_Comma)) {
+ // There is a plural argument.
+ plural = true;
+ }
+ }
+
+ // Derive the context from what preceded the call: nothing, "self"
+ // (innermost class on the stack), or an explicit object/class path.
+ if (prefix.isEmpty())
+ context = defaultContext;
+ else if (prefix == "self")
+ context = yyContextStack.isEmpty()
+ ? initialContext : yyContextStack.top().first;
+ else
+ context = prefix;
+
+ prefix.clear();
+
+ if (!text.isEmpty()) {
+ TranslatorMessage message(QString::fromUtf8(context),
+ QString::fromUtf8(text),
+ QString::fromUtf8(comment),
+ {}, yyFileName, yyLineNo,
+ {}, TranslatorMessage::Unfinished, plural);
+ setMessageParameters(&message);
+ tor.extend(message, cd);
+ }
+ }
+ break;
+ case Tok_translate: {
+ bool plural{};
+ if (parseTranslate(&text, &context, &comment, &utf8, &plural)
+ && !text.isEmpty()) {
+ TranslatorMessage message(QString::fromUtf8(context),
+ QString::fromUtf8(text),
+ QString::fromUtf8(comment),
+ {}, yyFileName, yyLineNo,
+ {}, TranslatorMessage::Unfinished, plural);
+ setMessageParameters(&message);
+ tor.extend(message, cd);
+ }
+ }
+ break;
+ case Tok_Ident:
+ // Extend the dotted prefix; it is dropped again unless a '.' follows.
+ if (!prefix.isEmpty())
+ prefix += '.';
+ prefix += yyIdent;
+ yyTok = getToken();
+ if (yyTok != Tok_Dot)
+ prefix.clear();
+ break;
+ case Tok_Comment:
+ // NOTE(review): getToken() as visible in this file does not appear to
+ // return Tok_Comment or fill yyComment - confirm whether this branch
+ // is reachable.
+ comment = yyComment;
+ comment = comment.simplified();
+ if (comment.left(sizeof(PythonMagicComment) - 1) == PythonMagicComment) {
+ comment.remove(0, sizeof(PythonMagicComment) - 1);
+ // "TRANSLATOR <context> [<comment>]"
+ int k = comment.indexOf(' ');
+ if (k == -1) {
+ context = comment;
+ } else {
+ context = comment.left(k);
+ comment.remove( 0, k + 1);
+ TranslatorMessage message(QString::fromUtf8(context),
+ {}, QString::fromUtf8(comment), {},
+ yyFileName, yyLineNo, {});
+ tor.extend(message, cd);
+ }
+ }
+ yyTok = getToken();
+ break;
+ default:
+ yyTok = getToken();
+ }
+ }
+
+ if (yyParenDepth != 0) {
+ qWarning("%s: Unbalanced parentheses in Python code",
+ qPrintable(yyFileName));
+ }
+}
+
+/*
+  Extracts the translatable messages of the Python file fileName into
+  translator. On the first call the configured translation function
+  aliases are added to the tokenizer's keyword table. Returns false,
+  with an error appended to cd, if the file cannot be opened.
+*/
+bool loadPython(Translator &translator, const QString &fileName, ConversionData &cd)
+{
+    // Match the function aliases to our tokens; done only once.
+    static bool firstTime = true;
+    if (firstTime) {
+        firstTime = false;
+        const auto &aliases = trFunctionAliasManager.nameToTrFunctionMap();
+        for (auto it = aliases.cbegin(), end = aliases.cend(); it != end; ++it) {
+            const auto function = it.value();
+            if (function == TrFunctionAliasManager::Function_tr
+                || function == TrFunctionAliasManager::Function_QT_TR_NOOP) {
+                tokens.insert(it.key().toUtf8(), Tok_tr);
+            } else if (function == TrFunctionAliasManager::Function_trUtf8) {
+                tokens.insert(it.key().toUtf8(), Tok_trUtf8);
+            } else if (function == TrFunctionAliasManager::Function_translate
+                       || function == TrFunctionAliasManager::Function_QT_TRANSLATE_NOOP
+                       || function == TrFunctionAliasManager::Function_findMessage) {
+                // QTranslator::findMessage() has the same parameters as
+                // QApplication::translate().
+                tokens.insert(it.key().toUtf8(), Tok_translate);
+            }
+        }
+    }
+
+#ifdef Q_CC_MSVC
+    const auto *nativeName = reinterpret_cast<const wchar_t *>(fileName.utf16());
+    const bool opened = _wfopen_s(&yyInFile, nativeName, L"r") == 0;
+#else
+    const QByteArray nativeName = QFile::encodeName(fileName);
+    yyInFile = std::fopen(nativeName.constData(), "r");
+    const bool opened = yyInFile != nullptr;
+#endif
+    if (!opened) {
+        cd.appendError(QStringLiteral("Cannot open %1").arg(fileName));
+        return false;
+    }
+
+    startTokenizer(fileName, getCharFromFile, peekCharFromFile);
+    parse(translator, cd);
+    std::fclose(yyInFile);
+    return true;
+}
+
+QT_END_NAMESPACE