QXmlStreamWriter: decode UTF-8 into code points
We were iterating over code *units*, which yielded wrong results. The failure from the bug report was caused simply by the fact that QUtf8StringView::value_type is char, which is signed on x86, so the expression:

    *it <= u'\x1F'

was true for every byte of non-US-ASCII content. But in attempting to fix this, I needed to do proper UTF-8 decoding, as otherwise we wouldn't catch forbidden characters outside Latin-1 (such as U+FFFE and U+FFFF), invalid sequences and such.

[ChangeLog][QtCore][QXmlStreamWriter] Fixed a bug that caused the class to fail to write UTF-8 strings with non-US-ASCII content when passed as a QUtf8StringView.

Fixes: QTBUG-122241
Pick-to: 6.6 6.5
Change-Id: I83dda2d36c904517b3c0fffd17b42bbf09a493d0
Reviewed-by: Mate Barany <mate.barany@qt.io>
(cherry picked from commit 94c62e322264e2e7d61193ae74ba8556a330385c)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
parent
5d1b211aba
commit
6bef40cb82
@ -2956,54 +2956,80 @@ void QXmlStreamWriterPrivate::write(QAnyStringView s)
|
||||
|
||||
void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace)
|
||||
{
|
||||
struct NextLatin1 {
|
||||
char32_t operator()(const char *&it, const char *) const
|
||||
{ return uchar(*it++); }
|
||||
};
|
||||
struct NextUtf8 {
|
||||
char32_t operator()(const char *&it, const char *end) const
|
||||
{
|
||||
uchar uc = *it++;
|
||||
char32_t utf32 = 0;
|
||||
char32_t *output = &utf32;
|
||||
qsizetype n = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(uc, output, it, end);
|
||||
return n < 0 ? 0 : utf32;
|
||||
}
|
||||
};
|
||||
struct NextUtf16 {
|
||||
char32_t operator()(const QChar *&it, const QChar *) const
|
||||
{
|
||||
return (it++)->unicode();
|
||||
}
|
||||
};
|
||||
|
||||
QString escaped;
|
||||
escaped.reserve(s.size());
|
||||
s.visit([&] (auto s) {
|
||||
using View = decltype(s);
|
||||
using Decoder = std::conditional_t<std::is_same_v<View, QLatin1StringView>, NextLatin1,
|
||||
std::conditional_t<std::is_same_v<View, QUtf8StringView>, NextUtf8, NextUtf16>>;
|
||||
|
||||
auto it = s.begin();
|
||||
const auto end = s.end();
|
||||
Decoder decoder;
|
||||
|
||||
while (it != end) {
|
||||
QLatin1StringView replacement;
|
||||
auto mark = it;
|
||||
|
||||
while (it != end) {
|
||||
if (*it == u'<') {
|
||||
auto next_it = it;
|
||||
char32_t uc = decoder(next_it, end);
|
||||
if (uc == u'<') {
|
||||
replacement = "<"_L1;
|
||||
break;
|
||||
} else if (*it == u'>') {
|
||||
} else if (uc == u'>') {
|
||||
replacement = ">"_L1;
|
||||
break;
|
||||
} else if (*it == u'&') {
|
||||
} else if (uc == u'&') {
|
||||
replacement = "&"_L1;
|
||||
break;
|
||||
} else if (*it == u'\"') {
|
||||
} else if (uc == u'\"') {
|
||||
replacement = """_L1;
|
||||
break;
|
||||
} else if (*it == u'\t') {
|
||||
} else if (uc == u'\t') {
|
||||
if (escapeWhitespace) {
|
||||
replacement = "	"_L1;
|
||||
break;
|
||||
}
|
||||
} else if (*it == u'\n') {
|
||||
} else if (uc == u'\n') {
|
||||
if (escapeWhitespace) {
|
||||
replacement = " "_L1;
|
||||
break;
|
||||
}
|
||||
} else if (*it == u'\v' || *it == u'\f') {
|
||||
} else if (uc == u'\v' || uc == u'\f') {
|
||||
hasEncodingError = true;
|
||||
break;
|
||||
} else if (*it == u'\r') {
|
||||
} else if (uc == u'\r') {
|
||||
if (escapeWhitespace) {
|
||||
replacement = " "_L1;
|
||||
break;
|
||||
}
|
||||
} else if (*it <= u'\x1F' || *it >= u'\uFFFE') {
|
||||
} else if (uc <= u'\x1F' || uc == u'\uFFFE' || uc == u'\uFFFF') {
|
||||
hasEncodingError = true;
|
||||
break;
|
||||
}
|
||||
++it;
|
||||
it = next_it;
|
||||
}
|
||||
|
||||
escaped.append(View{mark, it});
|
||||
|
@ -69,18 +69,27 @@ struct QUtf8BaseTraits
|
||||
// Stores b at *ptr, then moves ptr forward by one element.
static void appendByte(qchar8_t *&ptr, qchar8_t b)
{
    *ptr = b;
    ++ptr;
}
|
||||
|
||||
// Returns the element n positions after ptr, converted to uchar, without
// modifying ptr. Overloaded for char, uchar and qchar8_t input pointers.
static uchar peekByte(const char *ptr, qsizetype n = 0)
{
    return *(ptr + n);
}

static uchar peekByte(const uchar *ptr, qsizetype n = 0)
{
    return *(ptr + n);
}

static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
{
    return *(ptr + n);
}
|
||||
|
||||
// Returns the distance from ptr to end, in elements. Overloaded for char,
// uchar and qchar8_t input pointers.
static qptrdiff availableBytes(const char *ptr, const char *end)
{
    const qptrdiff remaining = end - ptr;
    return remaining;
}

static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
{
    const qptrdiff remaining = end - ptr;
    return remaining;
}

static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
{
    const qptrdiff remaining = end - ptr;
    return remaining;
}
|
||||
|
||||
static void advanceByte(const char *&ptr, qsizetype n = 1)
|
||||
{ ptr += n; }
|
||||
|
||||
static void advanceByte(const uchar *&ptr, qsizetype n = 1)
|
||||
{ ptr += n; }
|
||||
|
||||
|
@ -570,6 +570,12 @@ private slots:
|
||||
void hasAttribute() const;
|
||||
void writeWithUtf8Codec() const;
|
||||
void writeWithStandalone() const;
|
||||
void writeCharacters_data() const;
|
||||
void writeCharacters() const;
|
||||
void writeAttribute_data() const;
|
||||
void writeAttribute() const;
|
||||
void writeBadCharactersUtf8_data() const;
|
||||
void writeBadCharactersUtf8() const;
|
||||
void entitiesAndWhitespace_1() const;
|
||||
void entitiesAndWhitespace_2() const;
|
||||
void testFalsePrematureError() const;
|
||||
@ -1380,6 +1386,125 @@ void tst_QXmlStream::writeWithStandalone() const
|
||||
}
|
||||
}
|
||||
|
||||
static void writeCharacters_data_common()
|
||||
{
|
||||
QTest::addColumn<QString>("input");
|
||||
QTest::addColumn<QString>("output");
|
||||
|
||||
QTest::newRow("empty") << QString() << QString();
|
||||
|
||||
// invalid content
|
||||
QTest::newRow("null-character") << u"\0"_s << QString();
|
||||
QTest::newRow("vertical-tab") << "\v" << QString();
|
||||
QTest::newRow("form-feed") << "\f" << QString();
|
||||
QTest::newRow("esc") << "\x1f" << QString();
|
||||
QTest::newRow("U+FFFE") << u"\xfffe"_s << QString();
|
||||
QTest::newRow("U+FFFF") << u"\xffff"_s << QString();
|
||||
|
||||
// simple strings
|
||||
QTest::newRow("us-ascii") << "Hello, world" << "Hello, world";
|
||||
QTest::newRow("latin1") << "Bokmål" << "Bokmål";
|
||||
QTest::newRow("nonlatin1") << "Ελληνικά" << "Ελληνικά";
|
||||
QTest::newRow("nonbmp") << u"\U00010000"_s << u"\U00010000"_s;
|
||||
|
||||
// escaped content
|
||||
QTest::newRow("less-than") << "<" << "<";
|
||||
QTest::newRow("greater-than") << ">" << ">";
|
||||
QTest::newRow("ampersand") << "&" << "&";
|
||||
QTest::newRow("quote") << "\"" << """;
|
||||
}
|
||||
|
||||
template <typename Execute, typename Transform>
|
||||
static void writeCharacters_common(Execute &&exec, Transform &&transform)
|
||||
{
|
||||
QFETCH(QString, input);
|
||||
QFETCH(QString, output);
|
||||
QStringView utf16 = input;
|
||||
QByteArray utf8ba = input.toUtf8();
|
||||
QUtf8StringView utf8(utf8ba);
|
||||
|
||||
// may be invalid if input is not Latin1
|
||||
QByteArray l1ba = input.toLatin1();
|
||||
QLatin1StringView l1(l1ba);
|
||||
if (l1 != input)
|
||||
l1 = {};
|
||||
|
||||
auto write = [&](auto input) -> std::optional<QString> {
|
||||
QString result;
|
||||
QXmlStreamWriter writer(&result);
|
||||
writer.writeStartElement("a");
|
||||
exec(writer, input);
|
||||
writer.writeEndElement();
|
||||
if (writer.hasError())
|
||||
return std::nullopt;
|
||||
return result;
|
||||
};
|
||||
|
||||
if (input.isNull() != output.isNull()) {
|
||||
// error
|
||||
QCOMPARE(write(utf16), std::nullopt);
|
||||
QCOMPARE(write(utf8), std::nullopt);
|
||||
if (!l1.isEmpty())
|
||||
QCOMPARE(write(l1), std::nullopt);
|
||||
} else {
|
||||
output = transform(output);
|
||||
QCOMPARE(write(utf16), output);
|
||||
QCOMPARE(write(utf8), output);
|
||||
if (!l1.isEmpty())
|
||||
QCOMPARE(write(l1), output);
|
||||
}
|
||||
}
|
||||
|
||||
void tst_QXmlStream::writeCharacters_data() const
|
||||
{
|
||||
writeCharacters_data_common();
|
||||
QTest::newRow("tab") << "\t" << "\t";
|
||||
QTest::newRow("newline") << "\n" << "\n";
|
||||
QTest::newRow("carriage-return") << "\r" << "\r";
|
||||
}
|
||||
|
||||
void tst_QXmlStream::writeCharacters() const
|
||||
{
|
||||
auto exec = [](QXmlStreamWriter &writer, auto input) {
|
||||
writer.writeCharacters(input);
|
||||
};
|
||||
auto transform = [](auto output) { return "<a>" + output + "</a>"; };
|
||||
writeCharacters_common(exec, transform);
|
||||
}
|
||||
|
||||
void tst_QXmlStream::writeAttribute_data() const
|
||||
{
|
||||
writeCharacters_data_common();
|
||||
QTest::newRow("tab") << "\t" << "	";
|
||||
QTest::newRow("newline") << "\n" << " ";
|
||||
QTest::newRow("carriage-return") << "\r" << " ";
|
||||
}
|
||||
|
||||
void tst_QXmlStream::writeAttribute() const
|
||||
{
|
||||
auto exec = [](QXmlStreamWriter &writer, auto input) {
|
||||
writer.writeAttribute("b", input);
|
||||
};
|
||||
auto transform = [](auto output) { return "<a b=\"" + output + "\"/>"; };
|
||||
writeCharacters_common(exec, transform);
|
||||
}
|
||||
|
||||
#include "../../io/qurlinternal/utf8data.cpp"
|
||||
void tst_QXmlStream::writeBadCharactersUtf8_data() const
|
||||
{
|
||||
QTest::addColumn<QByteArray>("input");
|
||||
loadInvalidUtf8Rows();
|
||||
}
|
||||
|
||||
void tst_QXmlStream::writeBadCharactersUtf8() const
|
||||
{
|
||||
QFETCH(QByteArray, input);
|
||||
QString target;
|
||||
QXmlStreamWriter writer(&target);
|
||||
writer.writeTextElement("a", QUtf8StringView(input));
|
||||
QVERIFY(writer.hasError());
|
||||
}
|
||||
|
||||
void tst_QXmlStream::entitiesAndWhitespace_1() const
|
||||
{
|
||||
QXmlStreamReader reader(QLatin1String("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><test>&extEnt;</test>"));
|
||||
|
Loading…
x
Reference in New Issue
Block a user