QTextMarkdownWriter: escape all backslashes

A literal backslash needs to be doubled so that the parser doesn't treat
it as escaping the following character when the markdown is read back.
In ca4774131b9b8ee40b4d7f5c1ba296af4700207f we tried to limit it to
backslashes that were not already escaped. In case someone really needs
a longer series of backslashes, it's more correct to escape them all;
but this comes with the risk that if they do not get un-escaped by the
markdown parser in some scenario, repeated round-trip saving and loading
could multiply them excessively. So we also add a lot of tests to try
to verify that this is safe.

Task-number: QTBUG-96051
Fixes: QTBUG-122083
Change-Id: I64f610d24e99f67ebdc30d5ab5c6cf3985aec5ec
Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
(cherry picked from commit 0281005a711c3635114ba92f778d0e9c8a89027d)
Reviewed-by: Shawn Rutledge <shawn.rutledge@qt.io>
This commit is contained in:
Shawn Rutledge 2024-03-01 00:39:50 -07:00
parent 12c69ad04d
commit 662dbcb8a9
3 changed files with 208 additions and 10 deletions

View File

@ -301,25 +301,20 @@ static void maybeEscapeFirstChar(QString &s)
}
/*! \internal
Escape unescaped backslashes. Then escape any special character that stands
Escape all backslashes. Then escape any special character that stands
alone or prefixes a "word", including the \c < that starts an HTML tag.
https://spec.commonmark.org/0.31.2/#backslash-escapes
*/
static void escapeSpecialCharacters(QString &s)
{
static const QRegularExpression backslashRe(uR"([^\\]\\)"_s);
static const QRegularExpression spaceRe(uR"(\s+)"_s);
static const QRegularExpression specialRe(uR"([<!*[`&]+[/\w])"_s);
s.replace("\\"_L1, "\\\\"_L1);
int i = 0;
while (i >= 0) {
if (int j = s.indexOf(backslashRe, i); j >= 0) {
++j; // we found some char before the backslash that needs escaping
if (s.size() == j + 1 || s.at(j + 1) != qtmw_Backslash)
s.insert(j, qtmw_Backslash);
i = j + 3;
}
if (int j = s.indexOf(specialRe, i); j >= 0 && (j == 0 || s.at(j - 1) != u'\\')) {
if (int j = s.indexOf(specialRe, i); j >= 0) {
s.insert(j, qtmw_Backslash);
i = j + 3;
}

View File

@ -43,6 +43,8 @@ private slots:
void pathological();
void fencedCodeBlocks_data();
void fencedCodeBlocks();
void toRawText_data();
void toRawText();
private:
bool isMainFontFixed();
@ -536,6 +538,10 @@ void tst_QTextMarkdownImporter::fencedCodeBlocks_data()
<< "```pseudocode\nprint('hello world\\n')\n```\n"
<< 1 << 0 << "pseudocode" << "`"
<< "```pseudocode\nprint('hello world\\n')\n```\n\n";
QTest::newRow("backtick fence with punctuated language")
<< "```html+js\n<html><head><script>function hi() { console.log('\\\"hello world') }</script></head>blah</html>\n```\n"
<< 1 << 0 << "html+js" << "`"
<< "```html+js\n<html><head><script>function hi() { console.log('\\\"hello world') }</script></head>blah</html>\n```\n\n";
QTest::newRow("tilde fence with language")
<< "~~~pseudocode\nprint('hello world\\n')\n~~~\n"
<< 1 << 0 << "pseudocode" << "~"
@ -595,5 +601,101 @@ void tst_QTextMarkdownImporter::fencedCodeBlocks()
QCOMPARE(doc.toMarkdown(), rewrite);
}
void tst_QTextMarkdownImporter::toRawText_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expectedRawText");
// tests to verify that fixing QTBUG-122083 is safe
// https://spec.commonmark.org/0.31.2/#example-12
QTest::newRow("punctuation backslash escapes") <<
R"(\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~)" <<
R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)";
// https://spec.commonmark.org/0.31.2/#example-13
QTest::newRow("literal backslashes") <<
QString::fromUtf16(uR"(\\A\a\ \3\φ\«)") <<
QString::fromUtf16(uR"(\\A\a\ \3\φ\«)");
// https://spec.commonmark.org/0.31.2/#example-14
QTest::newRow("escape to avoid em") <<
R"(\*not emphasized*)" <<
R"(*not emphasized*)";
QTest::newRow("escape to avoid html") <<
R"(\<br/> not a tag)" <<
R"(<br/> not a tag)";
QTest::newRow("escape to avoid link") <<
R"(\[not a link](/foo))" <<
R"([not a link](/foo))";
QTest::newRow("escape to avoid mono") <<
R"(\`not code`)" <<
R"(`not code`)";
QTest::newRow("escape to avoid num list") <<
R"(1\. not a list)" <<
R"(1. not a list)";
QTest::newRow("escape to avoid list") <<
R"(\* not a list)" <<
R"(* not a list)";
QTest::newRow("escape to avoid heading") <<
R"(\# not a heading)" <<
R"(# not a heading)";
QTest::newRow("escape to avoid reflink") <<
R"(\[foo]: /url "not a reference")" <<
R"([foo]: /url "not a reference")";
QTest::newRow("escape to avoid entity") <<
R"(\&ouml; not a character entity)" <<
R"(&ouml; not a character entity)";
// https://spec.commonmark.org/0.31.2/#example-15
QTest::newRow("escape backslash only") <<
R"(\\*emphasis*)" <<
R"(\emphasis)";
// https://spec.commonmark.org/0.31.2/#example-16
QTest::newRow("backslash line break") <<
"foo\\\nbar" <<
"foo\u2029bar";
// https://spec.commonmark.org/0.31.2/#example-17
QTest::newRow("backslash in mono span") <<
R"(`` \[\` ``)" <<
R"(\[\`)";
// https://spec.commonmark.org/0.31.2/#example-18
QTest::newRow("backslash in indented code") <<
R"( \[\])" <<
R"(\[\])";
// https://spec.commonmark.org/0.31.2/#example-19
QTest::newRow("backslash in fenced code") <<
"~~~\n\\[\\]\n~~~" <<
R"(\[\])";
// https://spec.commonmark.org/0.31.2/#example-20
QTest::newRow("backslash in autolink") <<
R"(<https://example.com?find=\*>)" <<
R"(https://example.com?find=\*)";
// https://spec.commonmark.org/0.31.2/#example-21
QTest::newRow("backslash in autolink") <<
"<a href=\"/bar\\/)\"" <<
"<a href=\"/bar/)\"";
// https://spec.commonmark.org/0.31.2/#example-22
QTest::newRow("escapes in link") <<
R"([foo](/bar\* "ti\*tle"))" <<
R"(foo)";
// https://spec.commonmark.org/0.31.2/#example-24
QTest::newRow("backslash in code lang") <<
"```\nfoo\\+bar\nfoo\n```" <<
"foo\\+bar\u2029foo";
// end of tests to verify that fixing QTBUG-122083 is safe
// (it's ok to add unrelated markdown-to-rawtext cases later)
}
void tst_QTextMarkdownImporter::toRawText()
{
QFETCH(QString, input);
QFETCH(QString, expectedRawText);
QTextDocument doc;
doc.setMarkdown(input);
// These are testing md4c more than Qt, so any change may be an md4c bug, or a fix
QCOMPARE(doc.toRawText(), expectedRawText);
if (doc.blockCount() == 1)
QCOMPARE(doc.firstBlock().text(), expectedRawText);
}
QTEST_MAIN(tst_QTextMarkdownImporter)
#include "tst_qtextmarkdownimporter.moc"

View File

@ -44,6 +44,8 @@ private slots:
void rewriteDocument();
void fromHtml_data();
void fromHtml();
void fromPlainTextAndBack_data();
void fromPlainTextAndBack();
void escapeSpecialCharacters_data();
void escapeSpecialCharacters();
@ -824,6 +826,27 @@ void tst_QTextMarkdownWriter::fromHtml_data()
QTest::newRow("table with backslash in cell") << // QTBUG-96051
"<table><tr><td>1011011 [</td><td>1011100 backslash \\</td></tr></table>" <<
"|1011011 [|1011100 backslash \\\\|";
// https://spec.commonmark.org/0.31.2/#example-12
// escaping punctuation is ok, but QTextMarkdownWriter currently doesn't do that (which is also ok)
QTest::newRow("punctuation") <<
R"(<p>!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</p>)" <<
R"(!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~)";
// https://spec.commonmark.org/0.31.2/#example-14
QTest::newRow("backslash asterisk no emphasis") << // QTBUG-122083
R"(\*no emphasis*)" <<
R"(\\\*no emphasis*)";
// https://spec.commonmark.org/0.31.2/#example-15
QTest::newRow("backslash before emphasis") <<
R"(\<em>emphasis</em>)" <<
R"(\\*emphasis*)";
// https://spec.commonmark.org/0.31.2/#example-20
QTest::newRow("backslash-asterisk in autolink") <<
R"(<p><a href="https://example.com?find=\\*">https://example.com?find=\*</a></p>)" <<
R"(<https://example.com?find=\\*>)";
// https://spec.commonmark.org/0.31.2/#example-24
QTest::newRow("plus in fenced code lang") <<
"<pre class=\"language-foo+bar\">foo</pre>" <<
"```foo+bar\nfoo\n```";
}
void tst_QTextMarkdownWriter::fromHtml()
@ -850,12 +873,90 @@ void tst_QTextMarkdownWriter::fromHtml()
QCOMPARE(output, expectedOutput);
}
void tst_QTextMarkdownWriter::fromPlainTextAndBack_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expectedMarkdown");
// tests to verify that fixing QTBUG-122083 is safe
QTest::newRow("single backslashes") <<
R"(\ again: \ not esc: \* \-\-\ \*abc*)" <<
R"(\\ again: \\ not esc: \\* \\-\\-\\ \\\*abc*)";
// https://spec.commonmark.org/0.31.2/#example-12
QTest::newRow("punctuation") <<
R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)" <<
R"(!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~)";
// https://spec.commonmark.org/0.31.2/#example-13
QTest::newRow("literal backslashes") <<
QString::fromUtf16(uR"(\\A\a\ \3\φ\«)") <<
"\\\\\u2192\\\\A\\\\a\\\\ \\\\3\\\\\u03C6\\\\\u00AB";
// https://spec.commonmark.org/0.31.2/#example-14
QTest::newRow("escape to avoid em") <<
R"(*not emphasized*)" <<
R"(\*not emphasized*)";
QTest::newRow("escape to avoid html") <<
R"(<br/> not a tag)" <<
R"(\<br/> not a tag)";
QTest::newRow("escape to avoid link") <<
R"([not a link](/foo))" <<
R"(\[not a link](/foo))";
QTest::newRow("escape to avoid mono") <<
R"(`not code`)" <<
R"(\`not code`)";
QTest::newRow("escape to avoid num list") <<
R"(1. not a list)" <<
R"(1\. not a list)";
QTest::newRow("escape to avoid list") <<
R"(* not a list)" <<
R"(\* not a list)";
QTest::newRow("escape to avoid heading") <<
R"(# not a heading)" <<
R"(\# not a heading)";
QTest::newRow("escape to avoid reflink") <<
R"([foo]: /url "not a reference")" <<
R"(\[foo]: /url "not a reference")";
QTest::newRow("escape to avoid entity") <<
R"(&ouml; not a character entity)" <<
R"(\&ouml; not a character entity)";
// end of tests to verify that fixing QTBUG-122083 is safe
// (it's ok to add unrelated plain-to-markdown-to-plaintext cases later)
}
void tst_QTextMarkdownWriter::fromPlainTextAndBack()
{
QFETCH(QString, input);
QFETCH(QString, expectedMarkdown);
document->setPlainText(input);
QString output = documentToUnixMarkdown();
#ifdef DEBUG_WRITE_OUTPUT
{
QFile out("/tmp/" + QLatin1String(QTest::currentDataTag()) + ".md");
out.open(QFile::WriteOnly);
out.write(output.toUtf8());
out.close();
}
#endif
output = output.trimmed();
expectedMarkdown = expectedMarkdown.trimmed();
if (output != expectedMarkdown && (isMainFontFixed() || isFixedFontProportional()))
QSKIP("", "fixed main font or proportional fixed font (QTBUG-103484)");
QCOMPARE(output, expectedMarkdown);
QCOMPARE(document->toPlainText(), input);
document->setMarkdown(output);
QCOMPARE(document->toPlainText(), input);
if (document->blockCount() == 1)
QCOMPARE(document->firstBlock().text(), input);
}
void tst_QTextMarkdownWriter::escapeSpecialCharacters_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expectedOutput");
QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\ baz \\\\";
QTest::newRow("backslash") << "foo \\ bar \\\\ baz \\" << "foo \\\\ bar \\\\\\\\ baz \\\\";
QTest::newRow("not emphasized") << "*normal* **normal too**" << "\\*normal* \\**normal too**";
QTest::newRow("not code") << "`normal` `normal too`" << "\\`normal` \\`normal too`";
QTest::newRow("code fence") << "```not a fence; ``` no risk here; ```not a fence" // TODO slightly inconsistent