From 046b8523f20bdc86373eff7dfc2a1c0f7bb297d4 Mon Sep 17 00:00:00 2001
From: Ivan Solovev
Date: Mon, 24 Mar 2025 13:36:13 +0100
Subject: [PATCH] QXmlStreamReader: check appending data with unexpected encoding

If we add the first part of the XML document using a QBA overload of
the constructor or addData() method, the encoding of the document will
be extracted from the "encoding" attribute of the XML prolog.
This way, if the encoding is different from UTF-8, appending the data
with the QASV overload of addData() will always result in reading the
data incorrectly, because the implementation converts the input data
to UTF-8.

This test explicitly highlights this behavior.

In some cases the current behavior is probably fine. However, cases
like appending Latin1 to Latin1-encoded document or appending UTF-16
to UTF-16-encoded document should just work instead of doing the
unwanted conversion to UTF-8 and failing.

Task-number: QTBUG-124636
Task-number: QTBUG-135129
Pick-to: 6.9 6.8 6.5
Change-Id: Idf0571083e56032145478631538f09d251cb1022
Reviewed-by: Marc Mutz
---
 .../qxmlstream/tst_qxmlstream.cpp             | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
index 6f3ce0e2c02..76e53874480 100644
--- a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
+++ b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
@@ -571,6 +571,8 @@ private slots:
     void readFromQBufferInvalid() const;
     void readFromLatin1String() const;
     void readLatin1Document() const;
+    void appendToRawDocumentWithNonUtf8Encoding_data();
+    void appendToRawDocumentWithNonUtf8Encoding();
     void readNextStartElement() const;
     void readElementText() const;
     void readElementText_data() const;
@@ -1251,6 +1253,79 @@ void tst_QXmlStream::readLatin1Document() const
     }
 }
 
+void tst_QXmlStream::appendToRawDocumentWithNonUtf8Encoding_data()
+{
+    QTest::addColumn<QByteArray>("rawDocumentStart");
+    QTest::addColumn<QString>("expectedFirstElementText");
+    QTest::addColumn<QString>("nextData");
+    QTest::addColumn<QStringConverter::Encoding>("nextEncoding");
+    QTest::addColumn<QString>("expectedNextElementText");
+
+    auto row = [](const char *name, const QByteArray &encoding,
+                  const QByteArray &firstData, const QString &expectedFirstString,
+                  QStringConverter::Encoding nextEncoding, const QString &nextString) {
+        const QByteArray docStart = "<?xml version=\"1.0\" encoding=\""
+                                    + encoding + "\"?><foo><a>" + firstData + "</a>";
+        const QString nextElement = u"<a>"_s + nextString + u"</a>"_s;
+        QTest::newRow(name) << docStart << expectedFirstString << nextElement
+                            << nextEncoding << nextString;
+    };
+
+    row("l1+utf16", "iso-8859-1"_ba, "M\xE5rten"_ba, QString::fromLatin1("M\xE5rten"),
+        QStringConverter::Utf16, u"M\u00E5rten"_s);
+    row("l1+utf8", "iso-8859-1"_ba, "M\xE5rten"_ba, QString::fromLatin1("M\xE5rten"),
+        QStringConverter::Utf8, QString::fromUtf8("M\xC3\xA5rten"));
+    // Even this fails, because we internally convert the second L1 to UTF-8!
+    row("l1+l1", "iso-8859-1"_ba, "M\xE5rten"_ba, QString::fromLatin1("M\xE5rten"),
+        QStringConverter::Latin1, QString::fromLatin1("M\xE5rten"));
+
+    const QString utf16Str = u"<?xml version=\"1.0\" encoding=\"utf-16\"?><foo><a>"
+                             "M\u00E5rten</a>"_s;
+    const QByteArray utf16Data{reinterpret_cast<const char *>(utf16Str.utf16()),
+                               utf16Str.size() * 2};
+
+    QTest::newRow("utf16+utf16") << utf16Data << u"M\u00E5rten"_s
+                                 << u"<a>M\u00E5rten</a>"_s
+                                 << QStringConverter::Utf16
+                                 << u"M\u00E5rten"_s;
+}
+
+void tst_QXmlStream::appendToRawDocumentWithNonUtf8Encoding()
+{
+    QFETCH(const QByteArray, rawDocumentStart);
+    QFETCH(const QString, expectedFirstElementText);
+    QFETCH(const QString, nextData);
+    QFETCH(const QStringConverter::Encoding, nextEncoding);
+    QFETCH(const QString, expectedNextElementText);
+
+    QXmlStreamReader reader(rawDocumentStart);
+    QVERIFY(reader.readNextStartElement()); // foo
+    QVERIFY(reader.readNextStartElement()); // a
+    QString text = reader.readElementText();
+    QCOMPARE(text, expectedFirstElementText);
+
+    switch (nextEncoding) {
+    case QStringConverter::Utf16:
+        reader.addData(nextData);
+        break;
+    case QStringConverter::Utf8:
+        reader.addData(QUtf8StringView{nextData.toUtf8()});
+        break;
+    case QStringConverter::Latin1:
+        reader.addData(QLatin1StringView{nextData.toLatin1()});
+        break;
+    default:
+        Q_UNREACHABLE();
+    }
+    QEXPECT_FAIL("utf16+utf16", "QTBUG-135129: Parser expected UTF-16, but got UTF-8", Abort);
+    QVERIFY(reader.readNextStartElement()); // a
+    text = reader.readElementText();
+
+    QEXPECT_FAIL("", "Parser expects the data in the initial encoding, but we convert to UTF-8",
+                 Continue);
+    QCOMPARE(text, expectedNextElementText);
+}
+
 void tst_QXmlStream::readNextStartElement() const
 {
     QLatin1String in("text");