QXmlStreamWriter: decode UTF-8 into code points

We were iterating over code *units* and that yielded wrong results. The
one from the bug report was simply caused by the fact that
QUtf8StringView::value_type is char, which is signed on x86, so the
expression:
  *it <= u'\x1F'
was true for all non-Latin1 content.

But in attempting to fix this, I needed to do the proper UTF-8 decoding,
as otherwise we wouldn't catch non-Latin1 sequences and such.

[ChangeLog][QtCore][QXmlStreamWriter] Fixed a bug that caused the class
to fail to write UTF-8 strings with non-US-ASCII content when passed as
a QUtf8StringView.

Fixes: QTBUG-122241
Pick-to: 6.6 6.5
Change-Id: I83dda2d36c904517b3c0fffd17b42bbf09a493d0
Reviewed-by: Mate Barany <mate.barany@qt.io>
(cherry picked from commit 94c62e322264e2e7d61193ae74ba8556a330385c)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
Thiago Macieira 2024-02-15 15:04:18 -08:00 committed by Qt Cherry-pick Bot
parent 5d1b211aba
commit 6bef40cb82
3 changed files with 170 additions and 10 deletions

View File

@ -2956,54 +2956,80 @@ void QXmlStreamWriterPrivate::write(QAnyStringView s)
void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace)
{
struct NextLatin1 {
char32_t operator()(const char *&it, const char *) const
{ return uchar(*it++); }
};
// Decoder for UTF-8 input: reads the byte at `it`, hands it to the UTF-8
// decoder together with the rest of the range, and returns one char32_t.
struct NextUtf8 {
// `it` is taken by reference and is advanced; `end` is forwarded to the decoder.
char32_t operator()(const char *&it, const char *end) const
{
// First byte of the sequence; `it` now points just past it.
uchar uc = *it++;
char32_t utf32 = 0;
// The decoder is given a pointer to the local result variable.
char32_t *output = &utf32;
// NOTE(review): presumably fromUtf8() consumes the continuation bytes by
// advancing `it` and returns a negative value for malformed or truncated
// input -- confirm against QUtf8Functions.
qsizetype n = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(uc, output, it, end);
// A negative result is mapped to U+0000; the caller's `uc <= 0x1F` check
// then sets hasEncodingError for it.
return n < 0 ? 0 : utf32;
}
};
struct NextUtf16 {
char32_t operator()(const QChar *&it, const QChar *) const
{
return (it++)->unicode();
}
};
QString escaped;
escaped.reserve(s.size());
s.visit([&] (auto s) {
using View = decltype(s);
using Decoder = std::conditional_t<std::is_same_v<View, QLatin1StringView>, NextLatin1,
std::conditional_t<std::is_same_v<View, QUtf8StringView>, NextUtf8, NextUtf16>>;
auto it = s.begin();
const auto end = s.end();
Decoder decoder;
while (it != end) {
QLatin1StringView replacement;
auto mark = it;
while (it != end) {
if (*it == u'<') {
auto next_it = it;
char32_t uc = decoder(next_it, end);
if (uc == u'<') {
replacement = "&lt;"_L1;
break;
} else if (*it == u'>') {
} else if (uc == u'>') {
replacement = "&gt;"_L1;
break;
} else if (*it == u'&') {
} else if (uc == u'&') {
replacement = "&amp;"_L1;
break;
} else if (*it == u'\"') {
} else if (uc == u'\"') {
replacement = "&quot;"_L1;
break;
} else if (*it == u'\t') {
} else if (uc == u'\t') {
if (escapeWhitespace) {
replacement = "&#9;"_L1;
break;
}
} else if (*it == u'\n') {
} else if (uc == u'\n') {
if (escapeWhitespace) {
replacement = "&#10;"_L1;
break;
}
} else if (*it == u'\v' || *it == u'\f') {
} else if (uc == u'\v' || uc == u'\f') {
hasEncodingError = true;
break;
} else if (*it == u'\r') {
} else if (uc == u'\r') {
if (escapeWhitespace) {
replacement = "&#13;"_L1;
break;
}
} else if (*it <= u'\x1F' || *it >= u'\uFFFE') {
} else if (uc <= u'\x1F' || uc == u'\uFFFE' || uc == u'\uFFFF') {
hasEncodingError = true;
break;
}
++it;
it = next_it;
}
escaped.append(View{mark, it});

View File

@ -69,18 +69,27 @@ struct QUtf8BaseTraits
// Stores b at the position ptr refers to, then moves ptr one element forward.
static void appendByte(qchar8_t *&ptr, qchar8_t b)
{
    *ptr = b;
    ++ptr;
}
// Returns the element n positions after ptr (the element at ptr by default),
// converted to uchar. ptr itself is not modified.
static uchar peekByte(const char *ptr, qsizetype n = 0)
{
    return uchar(*(ptr + n));
}
static uchar peekByte(const uchar *ptr, qsizetype n = 0)
{
    return uchar(*(ptr + n));
}
static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
{
    return uchar(*(ptr + n));
}
// Returns the distance from ptr to end, i.e. how many elements remain.
static qptrdiff availableBytes(const char *ptr, const char *end)
{
    const qptrdiff remaining = end - ptr;
    return remaining;
}
static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
{
    const qptrdiff remaining = end - ptr;
    return remaining;
}
static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
{
    const qptrdiff remaining = end - ptr;
    return remaining;
}
// Moves ptr forward by n elements (one element by default).
static void advanceByte(const char *&ptr, qsizetype n = 1)
{
    ptr = ptr + n;
}
static void advanceByte(const uchar *&ptr, qsizetype n = 1)
{
    ptr = ptr + n;
}

View File

@ -570,6 +570,12 @@ private slots:
void hasAttribute() const;
void writeWithUtf8Codec() const;
void writeWithStandalone() const;
// Data-driven writer tests: each *_data() slot registers the rows that the
// slot of the same name (without the suffix) fetches with QFETCH.
void writeCharacters_data() const;
void writeCharacters() const;
void writeAttribute_data() const;
void writeAttribute() const;
// Feeds invalid UTF-8 byte sequences to the writer and expects an error.
void writeBadCharactersUtf8_data() const;
void writeBadCharactersUtf8() const;
void entitiesAndWhitespace_1() const;
void entitiesAndWhitespace_2() const;
void testFalsePrematureError() const;
@ -1380,6 +1386,125 @@ void tst_QXmlStream::writeWithStandalone() const
}
}
// Registers the "input" and "output" columns plus the rows shared by the
// writeCharacters() and writeAttribute() tests.
//
// Convention used by writeCharacters_common(): a row whose "output" is null
// while its "input" is not describes content the writer must reject.
static void writeCharacters_data_common()
{
    QTest::addColumn<QString>("input");
    QTest::addColumn<QString>("output");

    QTest::newRow("empty") << QString() << QString();

    // invalid content
    QTest::newRow("null-character") << u"\0"_s << QString();
    QTest::newRow("vertical-tab") << "\v" << QString();
    QTest::newRow("form-feed") << "\f" << QString();
    // ESC is U+001B. The row used to carry U+001F under this tag, so the tag
    // did not describe the data; U+001F now has its own, correctly named row.
    QTest::newRow("esc") << "\x1b" << QString();
    // U+001F is the upper boundary of the rejected control-character range.
    QTest::newRow("unit-separator") << "\x1f" << QString();
    QTest::newRow("U+FFFE") << u"\xfffe"_s << QString();
    QTest::newRow("U+FFFF") << u"\xffff"_s << QString();

    // simple strings
    QTest::newRow("us-ascii") << "Hello, world" << "Hello, world";
    QTest::newRow("latin1") << "Bokmål" << "Bokmål";
    QTest::newRow("nonlatin1") << "Ελληνικά" << "Ελληνικά";
    QTest::newRow("nonbmp") << u"\U00010000"_s << u"\U00010000"_s;

    // escaped content
    QTest::newRow("less-than") << "<" << "&lt;";
    QTest::newRow("greater-than") << ">" << "&gt;";
    QTest::newRow("ampersand") << "&" << "&amp;";
    QTest::newRow("quote") << "\"" << "&quot;";
}
template <typename Execute, typename Transform>
static void writeCharacters_common(Execute &&exec, Transform &&transform)
{
QFETCH(QString, input);
QFETCH(QString, output);
QStringView utf16 = input;
QByteArray utf8ba = input.toUtf8();
QUtf8StringView utf8(utf8ba);
// may be invalid if input is not Latin1
QByteArray l1ba = input.toLatin1();
QLatin1StringView l1(l1ba);
if (l1 != input)
l1 = {};
auto write = [&](auto input) -> std::optional<QString> {
QString result;
QXmlStreamWriter writer(&result);
writer.writeStartElement("a");
exec(writer, input);
writer.writeEndElement();
if (writer.hasError())
return std::nullopt;
return result;
};
if (input.isNull() != output.isNull()) {
// error
QCOMPARE(write(utf16), std::nullopt);
QCOMPARE(write(utf8), std::nullopt);
if (!l1.isEmpty())
QCOMPARE(write(l1), std::nullopt);
} else {
output = transform(output);
QCOMPARE(write(utf16), output);
QCOMPARE(write(utf8), output);
if (!l1.isEmpty())
QCOMPARE(write(l1), output);
}
}
// Rows for writeCharacters(): the common set plus whitespace controls, which
// are expected to appear unchanged in character data.
void tst_QXmlStream::writeCharacters_data() const
{
    writeCharacters_data_common();

    struct Row { const char *tag; const char *input; const char *output; };
    const Row whitespaceRows[] = {
        { "tab", "\t", "\t" },
        { "newline", "\n", "\n" },
        { "carriage-return", "\r", "\r" },
    };
    for (const Row &row : whitespaceRows)
        QTest::newRow(row.tag) << row.input << row.output;
}
void tst_QXmlStream::writeCharacters() const
{
auto exec = [](QXmlStreamWriter &writer, auto input) {
writer.writeCharacters(input);
};
auto transform = [](auto output) { return "<a>" + output + "</a>"; };
writeCharacters_common(exec, transform);
}
// Rows for writeAttribute(): the common set plus whitespace controls, which
// are expected to be written as numeric character references in attributes.
void tst_QXmlStream::writeAttribute_data() const
{
    writeCharacters_data_common();

    struct Row { const char *tag; const char *input; const char *output; };
    const Row whitespaceRows[] = {
        { "tab", "\t", "&#9;" },
        { "newline", "\n", "&#10;" },
        { "carriage-return", "\r", "&#13;" },
    };
    for (const Row &row : whitespaceRows)
        QTest::newRow(row.tag) << row.input << row.output;
}
void tst_QXmlStream::writeAttribute() const
{
auto exec = [](QXmlStreamWriter &writer, auto input) {
writer.writeAttribute("b", input);
};
auto transform = [](auto output) { return "<a b=\"" + output + "\"/>"; };
writeCharacters_common(exec, transform);
}
#include "../../io/qurlinternal/utf8data.cpp"
// Data provider for writeBadCharactersUtf8(): a single QByteArray column
// named "input".
void tst_QXmlStream::writeBadCharactersUtf8_data() const
{
// The column must be registered before any row is added.
QTest::addColumn<QByteArray>("input");
// NOTE(review): presumably defined in the utf8data.cpp included above and
// adds one row per invalid UTF-8 sequence -- confirm there.
loadInvalidUtf8Rows();
}
void tst_QXmlStream::writeBadCharactersUtf8() const
{
QFETCH(QByteArray, input);
QString target;
QXmlStreamWriter writer(&target);
writer.writeTextElement("a", QUtf8StringView(input));
QVERIFY(writer.hasError());
}
void tst_QXmlStream::entitiesAndWhitespace_1() const
{
QXmlStreamReader reader(QLatin1String("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><test>&extEnt;</test>"));