QXmlStreamWriter: decode UTF-8 into code points
We were iterating over code *units* and that yielded wrong results. The one from the bug report was simply caused by the fact that QUtf8StringView::value_type is char, which is signed on x86, so the expression:

    *it <= u'\x1F'

was true for all non-US-ASCII content (every UTF-8 byte with the high bit set is negative when char is signed). But in attempting to fix this, I needed to do proper UTF-8 decoding, as otherwise we wouldn't catch non-Latin1 sequences and such.

[ChangeLog][QtCore][QXmlStreamWriter] Fixed a bug that caused the class to fail to write UTF-8 strings with non-US-ASCII content when passed as a QUtf8StringView.

Fixes: QTBUG-122241
Pick-to: 6.6 6.5
Change-Id: I83dda2d36c904517b3c0fffd17b42bbf09a493d0
Reviewed-by: Mate Barany <mate.barany@qt.io>
(cherry picked from commit 94c62e322264e2e7d61193ae74ba8556a330385c)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
parent
5d1b211aba
commit
6bef40cb82
@ -2956,54 +2956,80 @@ void QXmlStreamWriterPrivate::write(QAnyStringView s)
|
|||||||
|
|
||||||
void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace)
|
void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace)
|
||||||
{
|
{
|
||||||
|
struct NextLatin1 {
|
||||||
|
char32_t operator()(const char *&it, const char *) const
|
||||||
|
{ return uchar(*it++); }
|
||||||
|
};
|
||||||
|
struct NextUtf8 {
|
||||||
|
char32_t operator()(const char *&it, const char *end) const
|
||||||
|
{
|
||||||
|
uchar uc = *it++;
|
||||||
|
char32_t utf32 = 0;
|
||||||
|
char32_t *output = &utf32;
|
||||||
|
qsizetype n = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(uc, output, it, end);
|
||||||
|
return n < 0 ? 0 : utf32;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
struct NextUtf16 {
|
||||||
|
char32_t operator()(const QChar *&it, const QChar *) const
|
||||||
|
{
|
||||||
|
return (it++)->unicode();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
QString escaped;
|
QString escaped;
|
||||||
escaped.reserve(s.size());
|
escaped.reserve(s.size());
|
||||||
s.visit([&] (auto s) {
|
s.visit([&] (auto s) {
|
||||||
using View = decltype(s);
|
using View = decltype(s);
|
||||||
|
using Decoder = std::conditional_t<std::is_same_v<View, QLatin1StringView>, NextLatin1,
|
||||||
|
std::conditional_t<std::is_same_v<View, QUtf8StringView>, NextUtf8, NextUtf16>>;
|
||||||
|
|
||||||
auto it = s.begin();
|
auto it = s.begin();
|
||||||
const auto end = s.end();
|
const auto end = s.end();
|
||||||
|
Decoder decoder;
|
||||||
|
|
||||||
while (it != end) {
|
while (it != end) {
|
||||||
QLatin1StringView replacement;
|
QLatin1StringView replacement;
|
||||||
auto mark = it;
|
auto mark = it;
|
||||||
|
|
||||||
while (it != end) {
|
while (it != end) {
|
||||||
if (*it == u'<') {
|
auto next_it = it;
|
||||||
|
char32_t uc = decoder(next_it, end);
|
||||||
|
if (uc == u'<') {
|
||||||
replacement = "<"_L1;
|
replacement = "<"_L1;
|
||||||
break;
|
break;
|
||||||
} else if (*it == u'>') {
|
} else if (uc == u'>') {
|
||||||
replacement = ">"_L1;
|
replacement = ">"_L1;
|
||||||
break;
|
break;
|
||||||
} else if (*it == u'&') {
|
} else if (uc == u'&') {
|
||||||
replacement = "&"_L1;
|
replacement = "&"_L1;
|
||||||
break;
|
break;
|
||||||
} else if (*it == u'\"') {
|
} else if (uc == u'\"') {
|
||||||
replacement = """_L1;
|
replacement = """_L1;
|
||||||
break;
|
break;
|
||||||
} else if (*it == u'\t') {
|
} else if (uc == u'\t') {
|
||||||
if (escapeWhitespace) {
|
if (escapeWhitespace) {
|
||||||
replacement = "	"_L1;
|
replacement = "	"_L1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (*it == u'\n') {
|
} else if (uc == u'\n') {
|
||||||
if (escapeWhitespace) {
|
if (escapeWhitespace) {
|
||||||
replacement = " "_L1;
|
replacement = " "_L1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (*it == u'\v' || *it == u'\f') {
|
} else if (uc == u'\v' || uc == u'\f') {
|
||||||
hasEncodingError = true;
|
hasEncodingError = true;
|
||||||
break;
|
break;
|
||||||
} else if (*it == u'\r') {
|
} else if (uc == u'\r') {
|
||||||
if (escapeWhitespace) {
|
if (escapeWhitespace) {
|
||||||
replacement = " "_L1;
|
replacement = " "_L1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (*it <= u'\x1F' || *it >= u'\uFFFE') {
|
} else if (uc <= u'\x1F' || uc == u'\uFFFE' || uc == u'\uFFFF') {
|
||||||
hasEncodingError = true;
|
hasEncodingError = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
++it;
|
it = next_it;
|
||||||
}
|
}
|
||||||
|
|
||||||
escaped.append(View{mark, it});
|
escaped.append(View{mark, it});
|
||||||
|
@ -69,18 +69,27 @@ struct QUtf8BaseTraits
|
|||||||
static void appendByte(qchar8_t *&ptr, qchar8_t b)
|
static void appendByte(qchar8_t *&ptr, qchar8_t b)
|
||||||
{ *ptr++ = b; }
|
{ *ptr++ = b; }
|
||||||
|
|
||||||
|
static uchar peekByte(const char *ptr, qsizetype n = 0)
|
||||||
|
{ return ptr[n]; }
|
||||||
|
|
||||||
static uchar peekByte(const uchar *ptr, qsizetype n = 0)
|
static uchar peekByte(const uchar *ptr, qsizetype n = 0)
|
||||||
{ return ptr[n]; }
|
{ return ptr[n]; }
|
||||||
|
|
||||||
static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
|
static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
|
||||||
{ return ptr[n]; }
|
{ return ptr[n]; }
|
||||||
|
|
||||||
|
static qptrdiff availableBytes(const char *ptr, const char *end)
|
||||||
|
{ return end - ptr; }
|
||||||
|
|
||||||
static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
|
static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
|
||||||
{ return end - ptr; }
|
{ return end - ptr; }
|
||||||
|
|
||||||
static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
|
static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
|
||||||
{ return end - ptr; }
|
{ return end - ptr; }
|
||||||
|
|
||||||
|
static void advanceByte(const char *&ptr, qsizetype n = 1)
|
||||||
|
{ ptr += n; }
|
||||||
|
|
||||||
static void advanceByte(const uchar *&ptr, qsizetype n = 1)
|
static void advanceByte(const uchar *&ptr, qsizetype n = 1)
|
||||||
{ ptr += n; }
|
{ ptr += n; }
|
||||||
|
|
||||||
|
@ -570,6 +570,12 @@ private slots:
|
|||||||
void hasAttribute() const;
|
void hasAttribute() const;
|
||||||
void writeWithUtf8Codec() const;
|
void writeWithUtf8Codec() const;
|
||||||
void writeWithStandalone() const;
|
void writeWithStandalone() const;
|
||||||
|
void writeCharacters_data() const;
|
||||||
|
void writeCharacters() const;
|
||||||
|
void writeAttribute_data() const;
|
||||||
|
void writeAttribute() const;
|
||||||
|
void writeBadCharactersUtf8_data() const;
|
||||||
|
void writeBadCharactersUtf8() const;
|
||||||
void entitiesAndWhitespace_1() const;
|
void entitiesAndWhitespace_1() const;
|
||||||
void entitiesAndWhitespace_2() const;
|
void entitiesAndWhitespace_2() const;
|
||||||
void testFalsePrematureError() const;
|
void testFalsePrematureError() const;
|
||||||
@ -1380,6 +1386,125 @@ void tst_QXmlStream::writeWithStandalone() const
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void writeCharacters_data_common()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QString>("input");
|
||||||
|
QTest::addColumn<QString>("output");
|
||||||
|
|
||||||
|
QTest::newRow("empty") << QString() << QString();
|
||||||
|
|
||||||
|
// invalid content
|
||||||
|
QTest::newRow("null-character") << u"\0"_s << QString();
|
||||||
|
QTest::newRow("vertical-tab") << "\v" << QString();
|
||||||
|
QTest::newRow("form-feed") << "\f" << QString();
|
||||||
|
QTest::newRow("esc") << "\x1f" << QString();
|
||||||
|
QTest::newRow("U+FFFE") << u"\xfffe"_s << QString();
|
||||||
|
QTest::newRow("U+FFFF") << u"\xffff"_s << QString();
|
||||||
|
|
||||||
|
// simple strings
|
||||||
|
QTest::newRow("us-ascii") << "Hello, world" << "Hello, world";
|
||||||
|
QTest::newRow("latin1") << "Bokmål" << "Bokmål";
|
||||||
|
QTest::newRow("nonlatin1") << "Ελληνικά" << "Ελληνικά";
|
||||||
|
QTest::newRow("nonbmp") << u"\U00010000"_s << u"\U00010000"_s;
|
||||||
|
|
||||||
|
// escaped content
|
||||||
|
QTest::newRow("less-than") << "<" << "<";
|
||||||
|
QTest::newRow("greater-than") << ">" << ">";
|
||||||
|
QTest::newRow("ampersand") << "&" << "&";
|
||||||
|
QTest::newRow("quote") << "\"" << """;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Execute, typename Transform>
|
||||||
|
static void writeCharacters_common(Execute &&exec, Transform &&transform)
|
||||||
|
{
|
||||||
|
QFETCH(QString, input);
|
||||||
|
QFETCH(QString, output);
|
||||||
|
QStringView utf16 = input;
|
||||||
|
QByteArray utf8ba = input.toUtf8();
|
||||||
|
QUtf8StringView utf8(utf8ba);
|
||||||
|
|
||||||
|
// may be invalid if input is not Latin1
|
||||||
|
QByteArray l1ba = input.toLatin1();
|
||||||
|
QLatin1StringView l1(l1ba);
|
||||||
|
if (l1 != input)
|
||||||
|
l1 = {};
|
||||||
|
|
||||||
|
auto write = [&](auto input) -> std::optional<QString> {
|
||||||
|
QString result;
|
||||||
|
QXmlStreamWriter writer(&result);
|
||||||
|
writer.writeStartElement("a");
|
||||||
|
exec(writer, input);
|
||||||
|
writer.writeEndElement();
|
||||||
|
if (writer.hasError())
|
||||||
|
return std::nullopt;
|
||||||
|
return result;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (input.isNull() != output.isNull()) {
|
||||||
|
// error
|
||||||
|
QCOMPARE(write(utf16), std::nullopt);
|
||||||
|
QCOMPARE(write(utf8), std::nullopt);
|
||||||
|
if (!l1.isEmpty())
|
||||||
|
QCOMPARE(write(l1), std::nullopt);
|
||||||
|
} else {
|
||||||
|
output = transform(output);
|
||||||
|
QCOMPARE(write(utf16), output);
|
||||||
|
QCOMPARE(write(utf8), output);
|
||||||
|
if (!l1.isEmpty())
|
||||||
|
QCOMPARE(write(l1), output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QXmlStream::writeCharacters_data() const
|
||||||
|
{
|
||||||
|
writeCharacters_data_common();
|
||||||
|
QTest::newRow("tab") << "\t" << "\t";
|
||||||
|
QTest::newRow("newline") << "\n" << "\n";
|
||||||
|
QTest::newRow("carriage-return") << "\r" << "\r";
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QXmlStream::writeCharacters() const
|
||||||
|
{
|
||||||
|
auto exec = [](QXmlStreamWriter &writer, auto input) {
|
||||||
|
writer.writeCharacters(input);
|
||||||
|
};
|
||||||
|
auto transform = [](auto output) { return "<a>" + output + "</a>"; };
|
||||||
|
writeCharacters_common(exec, transform);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QXmlStream::writeAttribute_data() const
|
||||||
|
{
|
||||||
|
writeCharacters_data_common();
|
||||||
|
QTest::newRow("tab") << "\t" << "	";
|
||||||
|
QTest::newRow("newline") << "\n" << " ";
|
||||||
|
QTest::newRow("carriage-return") << "\r" << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QXmlStream::writeAttribute() const
|
||||||
|
{
|
||||||
|
auto exec = [](QXmlStreamWriter &writer, auto input) {
|
||||||
|
writer.writeAttribute("b", input);
|
||||||
|
};
|
||||||
|
auto transform = [](auto output) { return "<a b=\"" + output + "\"/>"; };
|
||||||
|
writeCharacters_common(exec, transform);
|
||||||
|
}
|
||||||
|
|
||||||
|
#include "../../io/qurlinternal/utf8data.cpp"
|
||||||
|
void tst_QXmlStream::writeBadCharactersUtf8_data() const
|
||||||
|
{
|
||||||
|
QTest::addColumn<QByteArray>("input");
|
||||||
|
loadInvalidUtf8Rows();
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QXmlStream::writeBadCharactersUtf8() const
|
||||||
|
{
|
||||||
|
QFETCH(QByteArray, input);
|
||||||
|
QString target;
|
||||||
|
QXmlStreamWriter writer(&target);
|
||||||
|
writer.writeTextElement("a", QUtf8StringView(input));
|
||||||
|
QVERIFY(writer.hasError());
|
||||||
|
}
|
||||||
|
|
||||||
void tst_QXmlStream::entitiesAndWhitespace_1() const
|
void tst_QXmlStream::entitiesAndWhitespace_1() const
|
||||||
{
|
{
|
||||||
QXmlStreamReader reader(QLatin1String("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><test>&extEnt;</test>"));
|
QXmlStreamReader reader(QLatin1String("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><test>&extEnt;</test>"));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user