Extract and re-write "front matter" in markdown documents

It's increasingly common for YAML to be used as metadata in front of
markdown documents. md4c does not handle this, so we need to remove
it ahead of time; otherwise md4c would misinterpret it, e.g. as heading text.

The --- fences are expected to be the same regardless of the format of
what's between them, and the parser for that content (YAML or otherwise)
does not need to see them. So we remove them while reading, and
QTextMarkdownWriter writes them around the front matter if there is any.

If your application needs to parse this "front matter", just call
qtd->metaInformation(QTextDocument::FrontMatter).toUtf8() and feed that
to some parser that you've linked in, such as yaml-cpp.

Since YAML is used with GitHub Docs, we consider this feature to be part
of the GitHub dialect:
https://docs.github.com/en/contributing/writing-for-github-docs/using-yaml-frontmatter

[ChangeLog][QtGui][Text] Markdown "front matter" (usually YAML) is now
extracted during parsing (GitHub dialect) and can be retrieved from
QTextDocument::metaInformation(FrontMatter). QTextMarkdownWriter also
writes front matter (if any) to the output.

Fixes: QTBUG-120722
Change-Id: I220ddcd2b94c99453853643516ca7a36bb2bcd6f
Reviewed-by: Axel Spoerl <axel.spoerl@qt.io>
This commit is contained in:
Shawn Rutledge 2024-01-10 12:45:33 -07:00
parent 216af5d7f9
commit bffddc6a99
11 changed files with 102 additions and 3 deletions

View File

@ -1159,6 +1159,8 @@ QString QTextDocument::metaInformation(MetaInformation info) const
return d->url;
case CssMedia:
return d->cssMedia;
case FrontMatter:
return d->frontMatter;
}
return QString();
}
@ -1182,6 +1184,9 @@ void QTextDocument::setMetaInformation(MetaInformation info, const QString &stri
case CssMedia:
d->cssMedia = string;
break;
case FrontMatter:
d->frontMatter = string;
break;
}
}
@ -1327,6 +1332,10 @@ void QTextDocument::setHtml(const QString &html)
\value CssMedia This value is used to select the corresponding '@media'
rule, if any, from a specified CSS stylesheet when setHtml()
is called. This enum value has been introduced in Qt 6.3.
\value FrontMatter This value is used to select header material, if any was
extracted during parsing of the source file (currently
only from Markdown format). This enum value has been
introduced in Qt 6.8.
\sa metaInformation(), setMetaInformation(), setHtml()
*/

View File

@ -105,7 +105,8 @@ public:
enum MetaInformation {
DocumentTitle,
DocumentUrl,
CssMedia
CssMedia,
FrontMatter,
};
void setMetaInformation(MetaInformation info, const QString &);
QString metaInformation(MetaInformation info) const;
@ -119,7 +120,7 @@ public:
enum MarkdownFeature {
MarkdownNoHTML = 0x0020 | 0x0040,
MarkdownDialectCommonMark = 0,
MarkdownDialectGitHub = 0x0004 | 0x0008 | 0x0400 | 0x0100 | 0x0200 | 0x0800 | 0x4000
MarkdownDialectGitHub = 0x0004 | 0x0008 | 0x0400 | 0x0100 | 0x0200 | 0x0800 | 0x4000 | 0x100000
};
Q_DECLARE_FLAGS(MarkdownFeatures, MarkdownFeature)
Q_FLAG(MarkdownFeatures)

View File

@ -356,6 +356,7 @@ public:
QString title;
QString url;
QString cssMedia;
QString frontMatter;
qreal indentWidth;
qreal documentMargin;
QUrl baseUrl;

View File

@ -46,7 +46,8 @@ static_assert(int(QTextMarkdownImporter::FeaturePermissiveAutoLinks) == MD_FLAG_
static_assert(int(QTextMarkdownImporter::FeatureTasklists) == MD_FLAG_TASKLISTS);
static_assert(int(QTextMarkdownImporter::FeatureNoHTML) == MD_FLAG_NOHTML);
static_assert(int(QTextMarkdownImporter::DialectCommonMark) == MD_DIALECT_COMMONMARK);
static_assert(int(QTextMarkdownImporter::DialectGitHub) == (MD_DIALECT_GITHUB | MD_FLAG_UNDERLINE));
static_assert(int(QTextMarkdownImporter::DialectGitHub) ==
(MD_DIALECT_GITHUB | MD_FLAG_UNDERLINE | QTextMarkdownImporter::FeatureFrontMatter));
// --------------------------------------------------------
// MD4C callback function wrappers
@ -139,6 +140,21 @@ void QTextMarkdownImporter::import(const QString &markdown)
m_monoFont.setPixelSize(defaultFont.pixelSize());
qCDebug(lcMD) << "default font" << defaultFont << "mono font" << m_monoFont;
QByteArray md = markdown.toUtf8();
// Front matter: a leading "---" fence, arbitrary metadata (usually YAML) and a
// closing "---" fence. md4c does not handle it, so cut it out of the input here
// and keep the text between the fences as QTextDocument::FrontMatter.
if (md.startsWith("---") && m_features.testFlag(QTextMarkdownImporter::FeatureFrontMatter)) {
    const qsizetype endMarkerPos = md.indexOf("---", 4);
    if (endMarkerPos > 4) {
        qsizetype firstLinePos = 4; // first line of yaml
        // This scan stays in bounds: md.at(endMarkerPos) is '-', which ends it
        // at the latest when it reaches the closing fence.
        while (md.at(firstLinePos) == '\n' || md.at(firstLinePos) == '\r')
            ++firstLinePos;
        const QByteArray frontMatter = md.sliced(firstLinePos, endMarkerPos - firstLinePos);
        // First line of markdown after yaml. The input may end directly after
        // the closing fence (with or without a trailing newline), so clamp the
        // start position and bounds-check the scan instead of reading past the end.
        firstLinePos = qMin(endMarkerPos + 4, md.size());
        while (firstLinePos < md.size()
               && (md.at(firstLinePos) == '\n' || md.at(firstLinePos) == '\r'))
            ++firstLinePos;
        md.remove(0, firstLinePos);
        doc->setMetaInformation(QTextDocument::FrontMatter, QString::fromUtf8(frontMatter));
        qCDebug(lcMD) << "extracted FrontMatter: size" << frontMatter.size();
    }
}
m_cursor.beginEditBlock();
md_parse(md.constData(), MD_SIZE(md.size()), &callbacks, this);
m_cursor.endEditBlock();

View File

@ -46,6 +46,7 @@ public:
FeaturePermissiveWWWAutoLinks = 0x0400,
FeatureTasklists = 0x0800,
FeatureUnderline = 0x4000,
FeatureFrontMatter = 0x100000, // Qt feature, not yet in MD4C
// composite flags
FeaturePermissiveAutoLinks = FeaturePermissiveMailAutoLinks
| FeaturePermissiveURLAutoLinks | FeaturePermissiveWWWAutoLinks,

View File

@ -10,6 +10,7 @@
#include "qtexttable.h"
#include "qtextcursor.h"
#include "qtextimagehandler_p.h"
#include "qtextmarkdownimporter_p.h"
#include "qloggingcategory.h"
#if QT_CONFIG(itemmodel)
#include "qabstractitemmodel.h"
@ -38,6 +39,7 @@ QTextMarkdownWriter::QTextMarkdownWriter(QTextStream &stream, QTextDocument::Mar
bool QTextMarkdownWriter::writeAll(const QTextDocument *document)
{
writeFrontMatter(document->metaInformation(QTextDocument::FrontMatter));
writeFrame(document->rootFrame());
return true;
}
@ -76,6 +78,17 @@ void QTextMarkdownWriter::writeTable(const QAbstractItemModel *table)
}
#endif
void QTextMarkdownWriter::writeFrontMatter(const QString &fm)
{
if (fm.isEmpty() || !m_features.testFlag(static_cast<QTextDocument::MarkdownFeature>(
QTextMarkdownImporter::FeatureFrontMatter)))
return;
m_stream << "---\n"_L1 << fm;
if (!fm.endsWith(qtmw_Newline))
m_stream << qtmw_Newline;
m_stream << "---\n"_L1;
}
void QTextMarkdownWriter::writeFrame(const QTextFrame *frame)
{
Q_ASSERT(frame);

View File

@ -36,6 +36,7 @@ public:
int writeBlock(const QTextBlock &block, bool table, bool ignoreFormat, bool ignoreEmpty);
void writeFrame(const QTextFrame *frame);
void writeFrontMatter(const QString &fm);
private:
struct ListInfo {

View File

@ -0,0 +1,11 @@
---
name: "Venus"
discoverer: "Galileo Galilei"
title: "A description of the planet Venus"
keywords:
- planets
- solar system
- astronomy
---
*Venus* is the second planet from the Sun, orbiting it every 224.7 Earth days.

View File

@ -43,6 +43,7 @@ private slots:
void pathological();
void fencedCodeBlocks_data();
void fencedCodeBlocks();
void frontMatter();
private:
bool isMainFontFixed();
@ -595,5 +596,27 @@ void tst_QTextMarkdownImporter::fencedCodeBlocks()
QCOMPARE(doc.toMarkdown(), rewrite);
}
void tst_QTextMarkdownImporter::frontMatter()
{
QFile f(QFINDTESTDATA("data/yaml.md"));
QVERIFY(f.open(QFile::ReadOnly | QIODevice::Text));
QString md = QString::fromUtf8(f.readAll());
f.close();
const int yamlBegin = md.indexOf("name:");
const int yamlEnd = md.indexOf("---", yamlBegin);
const QString yaml = md.sliced(yamlBegin, yamlEnd - yamlBegin);
QTextDocument doc;
QTextMarkdownImporter(&doc, QTextMarkdownImporter::DialectGitHub).import(md);
int blockCount = 0;
for (QTextFrame::iterator iterator = doc.rootFrame()->begin(); !iterator.atEnd(); ++iterator) {
// Check whether the block is text or a horizontal rule
if (!iterator.currentBlock().text().isEmpty())
++blockCount;
}
QCOMPARE(blockCount, 1); // yaml is not part of the markdown text
QCOMPARE(doc.metaInformation(QTextDocument::FrontMatter), yaml); // without fences
}
QTEST_MAIN(tst_QTextMarkdownImporter)
#include "tst_qtextmarkdownimporter.moc"

View File

@ -0,0 +1,11 @@
---
name: "Venus"
discoverer: "Galileo Galilei"
title: "A description of the planet Venus"
keywords:
- planets
- solar system
- astronomy
---
*Venus* is the second planet from the Sun, orbiting it every 224.7 Earth days.

View File

@ -36,6 +36,7 @@ private slots:
void testWriteNestedNumericLists();
void testWriteNumericListWithStart();
void testWriteTable();
void frontMatter();
void rewriteDocument_data();
void rewriteDocument();
void fromHtml_data();
@ -525,6 +526,16 @@ void tst_QTextMarkdownWriter::testWriteTable()
QCOMPARE(md, expected);
}
void tst_QTextMarkdownWriter::frontMatter()
{
QTextCursor cursor(document);
cursor.insertText("bar");
document->setMetaInformation(QTextDocument::FrontMatter, "foo");
const QString output = documentToUnixMarkdown();
QCOMPARE(output, "---\nfoo\n---\nbar\n\n");
}
void tst_QTextMarkdownWriter::rewriteDocument_data()
{
QTest::addColumn<QString>("inputFile");
@ -535,6 +546,7 @@ void tst_QTextMarkdownWriter::rewriteDocument_data()
QTest::newRow("word wrap") << "wordWrap.md";
QTest::newRow("links") << "links.md";
QTest::newRow("lists and code blocks") << "listsAndCodeBlocks.md";
QTest::newRow("front matter") << "yaml.md";
}
void tst_QTextMarkdownWriter::rewriteDocument()