summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorShawn Rutledge <shawn.rutledge@qt.io>2024-03-01 00:39:50 -0700
committerShawn Rutledge <shawn.rutledge@qt.io>2024-03-26 00:47:37 -0700
commit0281005a711c3635114ba92f778d0e9c8a89027d (patch)
tree5705f86f23c9ea20105c1d447b0cf73ba74b73fc
parentae8031b5e72032f9e2884c18cd72639acfd0d1a4 (diff)
QTextMarkdownWriter: escape all backslashes
A literal backslash needs to be doubled so that the parser doesn't treat it as escaping the following character when the markdown is read back. In ca4774131b9b8ee40b4d7f5c1ba296af4700207f we tried to limit it to backslashes that were not already escaped. In case someone really needs a longer series of backslashes, it's more correct to escape them all; but this comes with the risk that if they do not get un-escaped by the markdown parser in some scenario, repeated round-trip saving and loading could multiply them excessively. So we also add a lot of tests to try to verify that this is safe. Task-number: QTBUG-96051 Fixes: QTBUG-122083 Pick-to: 6.7 Change-Id: I64f610d24e99f67ebdc30d5ab5c6cf3985aec5ec Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
-rw-r--r--src/gui/text/qtextmarkdownwriter.cpp13
-rw-r--r--tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp102
-rw-r--r--tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp103
3 files changed, 208 insertions, 10 deletions
diff --git a/src/gui/text/qtextmarkdownwriter.cpp b/src/gui/text/qtextmarkdownwriter.cpp
index e65709bfac..361158e722 100644
--- a/src/gui/text/qtextmarkdownwriter.cpp
+++ b/src/gui/text/qtextmarkdownwriter.cpp
@@ -316,25 +316,20 @@ static void maybeEscapeFirstChar(QString &s)
}
/*! \internal
- Escape unescaped backslashes. Then escape any special character that stands
+ Escape all backslashes. Then escape any special character that stands
alone or prefixes a "word", including the \c < that starts an HTML tag.
https://spec.commonmark.org/0.31.2/#backslash-escapes
*/
static void escapeSpecialCharacters(QString &s)
{
- static const QRegularExpression backslashRe(uR"([^\\]\\)"_s);
static const QRegularExpression spaceRe(uR"(\s+)"_s);
static const QRegularExpression specialRe(uR"([<!*[`&]+[/\w])"_s);
+ s.replace("\\"_L1, "\\\\"_L1);
+
int i = 0;
while (i >= 0) {
- if (int j = s.indexOf(backslashRe, i); j >= 0) {
- ++j; // we found some char before the backslash that needs escaping
- if (s.size() == j + 1 || s.at(j + 1) != qtmw_Backslash)
- s.insert(j, qtmw_Backslash);
- i = j + 3;
- }
- if (int j = s.indexOf(specialRe, i); j >= 0 && (j == 0 || s.at(j - 1) != u'\\')) {
+ if (int j = s.indexOf(specialRe, i); j >= 0) {
s.insert(j, qtmw_Backslash);
i = j + 3;
}
diff --git a/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp b/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp
index 645dd9c102..d9fe000253 100644
--- a/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp
+++ b/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp
@@ -46,6 +46,8 @@ private slots:
void fencedCodeBlocks();
void frontMatter_data();
void frontMatter();
+ void toRawText_data();
+ void toRawText();
private:
bool isMainFontFixed();
@@ -576,6 +578,10 @@ void tst_QTextMarkdownImporter::fencedCodeBlocks_data()
<< "```pseudocode\nprint('hello world\\n')\n```\n"
<< 1 << 0 << "pseudocode" << "`"
<< "```pseudocode\nprint('hello world\\n')\n```\n\n";
+ QTest::newRow("backtick fence with punctuated language")
+ << "```html+js\n<html><head><script>function hi() { console.log('\\\"hello world') }</script></head>blah</html>\n```\n"
+ << 1 << 0 << "html+js" << "`"
+ << "```html+js\n<html><head><script>function hi() { console.log('\\\"hello world') }</script></head>blah</html>\n```\n\n";
QTest::newRow("tilde fence with language")
<< "~~~pseudocode\nprint('hello world\\n')\n~~~\n"
<< 1 << 0 << "pseudocode" << "~"
@@ -669,5 +675,101 @@ void tst_QTextMarkdownImporter::frontMatter()
QCOMPARE(doc.metaInformation(QTextDocument::FrontMatter), yaml); // without fences
}
+void tst_QTextMarkdownImporter::toRawText_data()
+{
+ QTest::addColumn<QString>("input");
+ QTest::addColumn<QString>("expectedRawText");
+
+ // tests to verify that fixing QTBUG-122083 is safe
+ // https://spec.commonmark.org/0.31.2/#example-12
+ QTest::newRow("punctuation backslash escapes") <<
+ R"(\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~)" <<
+ R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)";
+ // https://spec.commonmark.org/0.31.2/#example-13
+ QTest::newRow("literal backslashes") <<
+ QString(uR"(\→\A\a\ \3\φ\«)") <<
+ QString(uR"(\→\A\a\ \3\φ\«)");
+ // https://spec.commonmark.org/0.31.2/#example-14
+ QTest::newRow("escape to avoid em") <<
+ R"(\*not emphasized*)" <<
+ R"(*not emphasized*)";
+ QTest::newRow("escape to avoid html") <<
+ R"(\<br/> not a tag)" <<
+ R"(<br/> not a tag)";
+ QTest::newRow("escape to avoid link") <<
+ R"(\[not a link](/foo))" <<
+ R"([not a link](/foo))";
+ QTest::newRow("escape to avoid mono") <<
+ R"(\`not code`)" <<
+ R"(`not code`)";
+ QTest::newRow("escape to avoid num list") <<
+ R"(1\. not a list)" <<
+ R"(1. not a list)";
+ QTest::newRow("escape to avoid list") <<
+ R"(\* not a list)" <<
+ R"(* not a list)";
+ QTest::newRow("escape to avoid heading") <<
+ R"(\# not a heading)" <<
+ R"(# not a heading)";
+ QTest::newRow("escape to avoid reflink") <<
+ R"(\[foo]: /url "not a reference")" <<
+ R"([foo]: /url "not a reference")";
+ QTest::newRow("escape to avoid entity") <<
+ R"(\&ouml; not a character entity)" <<
+ R"(&ouml; not a character entity)";
+ // https://spec.commonmark.org/0.31.2/#example-15
+ QTest::newRow("escape backslash only") <<
+ R"(\\*emphasis*)" <<
+ R"(\emphasis)";
+ // https://spec.commonmark.org/0.31.2/#example-16
+ QTest::newRow("backslash line break") <<
+ "foo\\\nbar" <<
+ "foo\u2029bar";
+ // https://spec.commonmark.org/0.31.2/#example-17
+ QTest::newRow("backslash in mono span") <<
+ R"(`` \[\` ``)" <<
+ R"(\[\`)";
+ // https://spec.commonmark.org/0.31.2/#example-18
+ QTest::newRow("backslash in indented code") <<
+ R"( \[\])" <<
+ R"(\[\])";
+ // https://spec.commonmark.org/0.31.2/#example-19
+ QTest::newRow("backslash in fenced code") <<
+ "~~~\n\\[\\]\n~~~" <<
+ R"(\[\])";
+ // https://spec.commonmark.org/0.31.2/#example-20
+ QTest::newRow("backslash in autolink") <<
+ R"(<https://example.com?find=\*>)" <<
+ R"(https://example.com?find=\*)";
+ // https://spec.commonmark.org/0.31.2/#example-21
+ QTest::newRow("backslash in autolink") <<
+ "<a href=\"/bar\\/)\"" <<
+ "<a href=\"/bar/)\"";
+ // https://spec.commonmark.org/0.31.2/#example-22
+ QTest::newRow("escapes in link") <<
+ R"([foo](/bar\* "ti\*tle"))" <<
+ R"(foo)";
+ // https://spec.commonmark.org/0.31.2/#example-24
+ QTest::newRow("backslash in code lang") <<
+ "```\nfoo\\+bar\nfoo\n```" <<
+ "foo\\+bar\u2029foo";
+ // end of tests to verify that fixing QTBUG-122083 is safe
+ // (it's ok to add unrelated markdown-to-rawtext cases later)
+}
+
+void tst_QTextMarkdownImporter::toRawText()
+{
+ QFETCH(QString, input);
+ QFETCH(QString, expectedRawText);
+
+ QTextDocument doc;
+ doc.setMarkdown(input);
+
+ // These are testing md4c more than Qt, so any change may be an md4c bug, or a fix
+ QCOMPARE(doc.toRawText(), expectedRawText);
+ if (doc.blockCount() == 1)
+ QCOMPARE(doc.firstBlock().text(), expectedRawText);
+}
+
QTEST_MAIN(tst_QTextMarkdownImporter)
#include "tst_qtextmarkdownimporter.moc"
diff --git a/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp b/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp
index d0ae34d67d..0d261bc27e 100644
--- a/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp
+++ b/tests/auto/gui/text/qtextmarkdownwriter/tst_qtextmarkdownwriter.cpp
@@ -45,6 +45,8 @@ private slots:
void rewriteDocument();
void fromHtml_data();
void fromHtml();
+ void fromPlainTextAndBack_data();
+ void fromPlainTextAndBack();
void escapeSpecialCharacters_data();
void escapeSpecialCharacters();
@@ -839,6 +841,27 @@ void tst_QTextMarkdownWriter::fromHtml_data()
QTest::newRow("table with backslash in cell") << // QTBUG-96051
"<table><tr><td>1011011 [</td><td>1011100 backslash \\</td></tr></table>" <<
"|1011011 [|1011100 backslash \\\\|";
+ // https://spec.commonmark.org/0.31.2/#example-12
+ // escaping punctuation is ok, but QTextMarkdownWriter currently doesn't do that (which is also ok)
+ QTest::newRow("punctuation") <<
+ R"(<p>!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</p>)" <<
+ R"(!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~)";
+ // https://spec.commonmark.org/0.31.2/#example-14
+ QTest::newRow("backslash asterisk no emphasis") << // QTBUG-122083
+ R"(\*no emphasis*)" <<
+ R"(\\\*no emphasis*)";
+ // https://spec.commonmark.org/0.31.2/#example-15
+ QTest::newRow("backslash before emphasis") <<
+ R"(\<em>emphasis</em>)" <<
+ R"(\\*emphasis*)";
+ // https://spec.commonmark.org/0.31.2/#example-20
+ QTest::newRow("backslash-asterisk in autolink") <<
+ R"(<p><a href="https://example.com?find=\\*">https://example.com?find=\*</a></p>)" <<
+ R"(<https://example.com?find=\\*>)";
+ // https://spec.commonmark.org/0.31.2/#example-24
+ QTest::newRow("plus in fenced code lang") <<
+ "<pre class=\"language-foo+bar\">foo</pre>" <<
+ "```foo+bar\nfoo\n```";
}
void tst_QTextMarkdownWriter::fromHtml()
@@ -865,12 +888,90 @@ void tst_QTextMarkdownWriter::fromHtml()
QCOMPARE(output, expectedOutput);
}
+void tst_QTextMarkdownWriter::fromPlainTextAndBack_data()
+{
+ QTest::addColumn<QString>("input");
+ QTest::addColumn<QString>("expectedMarkdown");
+
+ // tests to verify that fixing QTBUG-122083 is safe
+ QTest::newRow("single backslashes") <<
+ R"(\ again: \ not esc: \* \-\-\ \*abc*)" <<
+ R"(\\ again: \\ not esc: \\* \\-\\-\\ \\\*abc*)";
+ // https://spec.commonmark.org/0.31.2/#example-12
+ QTest::newRow("punctuation") <<
+ R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)" <<
+ R"(!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~)";
+ // https://spec.commonmark.org/0.31.2/#example-13
+ QTest::newRow("literal backslashes") <<
+ QString(uR"(\→\A\a\ \3\φ\«)") <<
+ "\\\\\u2192\\\\A\\\\a\\\\ \\\\3\\\\\u03C6\\\\\u00AB";
+ // https://spec.commonmark.org/0.31.2/#example-14
+ QTest::newRow("escape to avoid em") <<
+ R"(*not emphasized*)" <<
+ R"(\*not emphasized*)";
+ QTest::newRow("escape to avoid html") <<
+ R"(<br/> not a tag)" <<
+ R"(\<br/> not a tag)";
+ QTest::newRow("escape to avoid link") <<
+ R"([not a link](/foo))" <<
+ R"(\[not a link](/foo))";
+ QTest::newRow("escape to avoid mono") <<
+ R"(`not code`)" <<
+ R"(\`not code`)";
+ QTest::newRow("escape to avoid num list") <<
+ R"(1. not a list)" <<
+ R"(1\. not a list)";
+ QTest::newRow("escape to avoid list") <<
+ R"(* not a list)" <<
+ R"(\* not a list)";
+ QTest::newRow("escape to avoid heading") <<
+ R"(# not a heading)" <<
+ R"(\# not a heading)";
+ QTest::newRow("escape to avoid reflink") <<
+ R"([foo]: /url "not a reference")" <<
+ R"(\[foo]: /url "not a reference")";
+ QTest::newRow("escape to avoid entity") <<
+ R"(&ouml; not a character entity)" <<
+ R"(\&ouml; not a character entity)";
+ // end of tests to verify that fixing QTBUG-122083 is safe
+ // (it's ok to add unrelated plain-to-markdown-to-plaintext cases later)
+}
+
+void tst_QTextMarkdownWriter::fromPlainTextAndBack()
+{
+ QFETCH(QString, input);
+ QFETCH(QString, expectedMarkdown);
+
+ document->setPlainText(input);
+ QString output = documentToUnixMarkdown();
+
+#ifdef DEBUG_WRITE_OUTPUT
+ {
+ QFile out("/tmp/" + QLatin1String(QTest::currentDataTag()) + ".md");
+ out.open(QFile::WriteOnly);
+ out.write(output.toUtf8());
+ out.close();
+ }
+#endif
+
+ output = output.trimmed();
+ expectedMarkdown = expectedMarkdown.trimmed();
+ if (output != expectedMarkdown && (isMainFontFixed() || isFixedFontProportional()))
+ QSKIP("", "fixed main font or proportional fixed font (QTBUG-103484)");
+ QCOMPARE(output, expectedMarkdown);
+ QCOMPARE(document->toPlainText(), input);
+ document->setMarkdown(output);
+ QCOMPARE(document->toPlainText(), input);
+ if (document->blockCount() == 1)
+ QCOMPARE(document->firstBlock().text(), input);
+}
+
void tst_QTextMarkdownWriter::escapeSpecialCharacters_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expectedOutput");
- QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\ baz \\\\";
+ QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\\\\\ baz \\\\";
QTest::newRow("not emphasized") << "*normal* **normal too**" << "\\*normal* \\**normal too**";
QTest::newRow("not code") << "`normal` `normal too`" << "\\`normal` \\`normal too`";
QTest::newRow("code fence") << "```not a fence; ``` no risk here; ```not a fence" // TODO slightly inconsistent