From 662dbcb8a9efcdda798b0666eb3db1b82aef1ea8 Mon Sep 17 00:00:00 2001 From: Shawn Rutledge Date: Fri, 1 Mar 2024 00:39:50 -0700 Subject: [PATCH] QTextMarkdownWriter: escape all backslashes A literal backslash needs to be doubled so that the parser doesn't treat it as escaping the following character when the markdown is read back. In ca4774131b9b8ee40b4d7f5c1ba296af4700207f we tried to limit it to backslashes that were not already escaped. In case someone really needs a longer series of backslashes, it's more correct to escape them all; but this comes with the risk that if they do not get un-escaped by the markdown parser in some scenario, repeated round-trip saving and loading could multiply them excessively. So we also add a lot of tests to try to verify that this is safe. Task-number: QTBUG-96051 Fixes: QTBUG-122083 Change-Id: I64f610d24e99f67ebdc30d5ab5c6cf3985aec5ec Reviewed-by: Axel Spoerl (cherry picked from commit 0281005a711c3635114ba92f778d0e9c8a89027d) Reviewed-by: Shawn Rutledge --- src/gui/text/qtextmarkdownwriter.cpp | 13 +-- .../tst_qtextmarkdownimporter.cpp | 102 +++++++++++++++++ .../tst_qtextmarkdownwriter.cpp | 103 +++++++++++++++++- 3 files changed, 208 insertions(+), 10 deletions(-) diff --git a/src/gui/text/qtextmarkdownwriter.cpp b/src/gui/text/qtextmarkdownwriter.cpp index 3c05fed6a69..1acee65db69 100644 --- a/src/gui/text/qtextmarkdownwriter.cpp +++ b/src/gui/text/qtextmarkdownwriter.cpp @@ -301,25 +301,20 @@ static void maybeEscapeFirstChar(QString &s) } /*! \internal - Escape unescaped backslashes. Then escape any special character that stands + Escape all backslashes. Then escape any special character that stands alone or prefixes a "word", including the \c < that starts an HTML tag. https://spec.commonmark.org/0.31.2/#backslash-escapes */ static void escapeSpecialCharacters(QString &s) { - static const QRegularExpression backslashRe(uR"([^\\]\\)"_s); static const QRegularExpression spaceRe(uR"(\s+)"_s); static const QRegularExpression specialRe(uR"([= 0) { - if (int j = s.indexOf(backslashRe, i); j >= 0) { - ++j; // we found some char before the backslash that needs escaping - if (s.size() == j + 1 || s.at(j + 1) != qtmw_Backslash) - s.insert(j, qtmw_Backslash); - i = j + 3; - } - if (int j = s.indexOf(specialRe, i); j >= 0 && (j == 0 || s.at(j - 1) != u'\\')) { + if (int j = s.indexOf(specialRe, i); j >= 0) { s.insert(j, qtmw_Backslash); i = j + 3; } diff --git a/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp b/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp index 0c889947a83..c0aa37286af 100644 --- a/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp +++ b/tests/auto/gui/text/qtextmarkdownimporter/tst_qtextmarkdownimporter.cpp @@ -43,6 +43,8 @@ private slots: void pathological(); void fencedCodeBlocks_data(); void fencedCodeBlocks(); + void toRawText_data(); + void toRawText(); private: bool isMainFontFixed(); @@ -536,6 +538,10 @@ void tst_QTextMarkdownImporter::fencedCodeBlocks_data() << "```pseudocode\nprint('hello world\\n')\n```\n" << 1 << 0 << "pseudocode" << "`" << "```pseudocode\nprint('hello world\\n')\n```\n\n"; + QTest::newRow("backtick fence with punctuated language") + << "```html+js\nblah\n```\n" + << 1 << 0 << "html+js" << "`" + << "```html+js\nblah\n```\n\n"; QTest::newRow("tilde fence with language") << "~~~pseudocode\nprint('hello world\\n')\n~~~\n" << 1 << 0 << "pseudocode" << "~" @@ -595,5 +601,101 @@ void tst_QTextMarkdownImporter::fencedCodeBlocks() QCOMPARE(doc.toMarkdown(), rewrite); } +void tst_QTextMarkdownImporter::toRawText_data() +{ + QTest::addColumn("input"); + QTest::addColumn("expectedRawText"); + + // tests to verify that fixing QTBUG-122083 is safe + // https://spec.commonmark.org/0.31.2/#example-12 + QTest::newRow("punctuation backslash escapes") << + R"(\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~)" << + R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)"; + // https://spec.commonmark.org/0.31.2/#example-13 + QTest::newRow("literal backslashes") << + QString::fromUtf16(uR"(\→\A\a\ \3\φ\«)") << + QString::fromUtf16(uR"(\→\A\a\ \3\φ\«)"); + // https://spec.commonmark.org/0.31.2/#example-14 + QTest::newRow("escape to avoid em") << + R"(\*not emphasized*)" << + R"(*not emphasized*)"; + QTest::newRow("escape to avoid html") << + R"(\
not a tag)" << + R"(
not a tag)"; + QTest::newRow("escape to avoid link") << + R"(\[not a link](/foo))" << + R"([not a link](/foo))"; + QTest::newRow("escape to avoid mono") << + R"(\`not code`)" << + R"(`not code`)"; + QTest::newRow("escape to avoid num list") << + R"(1\. not a list)" << + R"(1. not a list)"; + QTest::newRow("escape to avoid list") << + R"(\* not a list)" << + R"(* not a list)"; + QTest::newRow("escape to avoid heading") << + R"(\# not a heading)" << + R"(# not a heading)"; + QTest::newRow("escape to avoid reflink") << + R"(\[foo]: /url "not a reference")" << + R"([foo]: /url "not a reference")"; + QTest::newRow("escape to avoid entity") << + R"(\ö not a character entity)" << + R"(ö not a character entity)"; + // https://spec.commonmark.org/0.31.2/#example-15 + QTest::newRow("escape backslash only") << + R"(\\*emphasis*)" << + R"(\emphasis)"; + // https://spec.commonmark.org/0.31.2/#example-16 + QTest::newRow("backslash line break") << + "foo\\\nbar" << + "foo\u2029bar"; + // https://spec.commonmark.org/0.31.2/#example-17 + QTest::newRow("backslash in mono span") << + R"(`` \[\` ``)" << + R"(\[\`)"; + // https://spec.commonmark.org/0.31.2/#example-18 + QTest::newRow("backslash in indented code") << + R"( \[\])" << + R"(\[\])"; + // https://spec.commonmark.org/0.31.2/#example-19 + QTest::newRow("backslash in fenced code") << + "~~~\n\\[\\]\n~~~" << + R"(\[\])"; + // https://spec.commonmark.org/0.31.2/#example-20 + QTest::newRow("backslash in autolink") << + R"()" << + R"(https://example.com?find=\*)"; + // https://spec.commonmark.org/0.31.2/#example-21 + QTest::newRow("backslash in autolink") << + "1011011 [1011100 backslash \\" << "|1011011 [|1011100 backslash \\\\|"; + // https://spec.commonmark.org/0.31.2/#example-12 + // escaping punctuation is ok, but QTextMarkdownWriter currently doesn't do that (which is also ok) + QTest::newRow("punctuation") << + R"(

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

)" << + R"(!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~)"; + // https://spec.commonmark.org/0.31.2/#example-14 + QTest::newRow("backslash asterisk no emphasis") << // QTBUG-122083 + R"(\*no emphasis*)" << + R"(\\\*no emphasis*)"; + // https://spec.commonmark.org/0.31.2/#example-15 + QTest::newRow("backslash before emphasis") << + R"(\emphasis)" << + R"(\\*emphasis*)"; + // https://spec.commonmark.org/0.31.2/#example-20 + QTest::newRow("backslash-asterisk in autolink") << + R"(

https://example.com?find=\*

)" << + R"()"; + // https://spec.commonmark.org/0.31.2/#example-24 + QTest::newRow("plus in fenced code lang") << + "
foo
" << + "```foo+bar\nfoo\n```"; } void tst_QTextMarkdownWriter::fromHtml() @@ -850,12 +873,90 @@ void tst_QTextMarkdownWriter::fromHtml() QCOMPARE(output, expectedOutput); } +void tst_QTextMarkdownWriter::fromPlainTextAndBack_data() +{ + QTest::addColumn("input"); + QTest::addColumn("expectedMarkdown"); + + // tests to verify that fixing QTBUG-122083 is safe + QTest::newRow("single backslashes") << + R"(\ again: \ not esc: \* \-\-\ \*abc*)" << + R"(\\ again: \\ not esc: \\* \\-\\-\\ \\\*abc*)"; + // https://spec.commonmark.org/0.31.2/#example-12 + QTest::newRow("punctuation") << + R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)" << + R"(!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~)"; + // https://spec.commonmark.org/0.31.2/#example-13 + QTest::newRow("literal backslashes") << + QString::fromUtf16(uR"(\→\A\a\ \3\φ\«)") << + "\\\\\u2192\\\\A\\\\a\\\\ \\\\3\\\\\u03C6\\\\\u00AB"; + // https://spec.commonmark.org/0.31.2/#example-14 + QTest::newRow("escape to avoid em") << + R"(*not emphasized*)" << + R"(\*not emphasized*)"; + QTest::newRow("escape to avoid html") << + R"(
not a tag)" << + R"(\
not a tag)"; + QTest::newRow("escape to avoid link") << + R"([not a link](/foo))" << + R"(\[not a link](/foo))"; + QTest::newRow("escape to avoid mono") << + R"(`not code`)" << + R"(\`not code`)"; + QTest::newRow("escape to avoid num list") << + R"(1. not a list)" << + R"(1\. not a list)"; + QTest::newRow("escape to avoid list") << + R"(* not a list)" << + R"(\* not a list)"; + QTest::newRow("escape to avoid heading") << + R"(# not a heading)" << + R"(\# not a heading)"; + QTest::newRow("escape to avoid reflink") << + R"([foo]: /url "not a reference")" << + R"(\[foo]: /url "not a reference")"; + QTest::newRow("escape to avoid entity") << + R"(ö not a character entity)" << + R"(\ö not a character entity)"; + // end of tests to verify that fixing QTBUG-122083 is safe + // (it's ok to add unrelated plain-to-markdown-to-plaintext cases later) +} + +void tst_QTextMarkdownWriter::fromPlainTextAndBack() +{ + QFETCH(QString, input); + QFETCH(QString, expectedMarkdown); + + document->setPlainText(input); + QString output = documentToUnixMarkdown(); + +#ifdef DEBUG_WRITE_OUTPUT + { + QFile out("/tmp/" + QLatin1String(QTest::currentDataTag()) + ".md"); + out.open(QFile::WriteOnly); + out.write(output.toUtf8()); + out.close(); + } +#endif + + output = output.trimmed(); + expectedMarkdown = expectedMarkdown.trimmed(); + if (output != expectedMarkdown && (isMainFontFixed() || isFixedFontProportional())) + QSKIP("", "fixed main font or proportional fixed font (QTBUG-103484)"); + QCOMPARE(output, expectedMarkdown); + QCOMPARE(document->toPlainText(), input); + document->setMarkdown(output); + QCOMPARE(document->toPlainText(), input); + if (document->blockCount() == 1) + QCOMPARE(document->firstBlock().text(), input); +} + void tst_QTextMarkdownWriter::escapeSpecialCharacters_data() { QTest::addColumn("input"); QTest::addColumn("expectedOutput"); - QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\ baz \\\\"; + QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\\\\\ baz \\\\"; QTest::newRow("not emphasized") << "*normal* **normal too**" << "\\*normal* \\**normal too**"; QTest::newRow("not code") << "`normal` `normal too`" << "\\`normal` \\`normal too`"; QTest::newRow("code fence") << "```not a fence; ``` no risk here; ```not a fence" // TODO slightly inconsistent