Add QStringDecoder::decoderForHtml()

Now that QStringConverter can handle non UTF encodings through ICU,
add a way to get a decoder for arbitrary HTML code.

Opposed to QStringConverter::encodingForHtml(), this method will
try to create a valid string decoder also for non unicode codecs.

Pick-to: 6.4
Change-Id: I343584da1b114396c744f482d9b433c9cedcc511
Reviewed-by: Fabian Kosmale <fabian.kosmale@qt.io>
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Lars Knoll 2022-05-31 11:58:50 +02:00 committed by Fabian Kosmale
parent d531c4b65d
commit 9c1f3b6d4d
5 changed files with 124 additions and 43 deletions

View File

@ -127,10 +127,9 @@ QVariant QMimeDataPrivate::retrieveTypedData(const QString &format, QMetaType ty
if (ba.isNull())
return QVariant();
if (format == "text/html"_L1) {
auto encoding = QStringConverter::encodingForHtml(ba);
if (encoding) {
QStringDecoder toUtf16(*encoding);
return QString(toUtf16(ba));
QStringDecoder decoder = QStringDecoder::decoderForHtml(ba);
if (decoder.isValid()) {
return QString(decoder(ba));
}
// fall back to utf8
}

View File

@ -2049,20 +2049,8 @@ QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCha
return std::nullopt;
}
/*!
Tries to determine the encoding of the HTML in \a data by looking at leading byte
order marks or a charset specifier in the HTML meta tag. If the optional is empty,
the encoding specified is not supported by QStringConverter. If no encoding is
detected, the method returns Utf8.
*/
std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
{
// determine charset
auto encoding = encodingForData(data);
if (encoding)
// trust the initial BOM
return encoding;
static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
@ -2089,14 +2077,62 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByt
if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
name = QByteArrayLiteral("UTF-8");
if (!name.isEmpty())
return encodingForName(name);
return name;
}
}
}
}
return QByteArray();
}
/*!
Tries to determine the encoding of the HTML in \a data by looking at leading byte
order marks or a charset specifier in the HTML meta tag. If the optional is empty,
the encoding specified is not supported by QStringConverter. If no encoding is
detected, the method returns Utf8.
\sa QStringDecoder::decoderForHtml()
*/
std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
{
// determine charset
std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
if (encoding)
// trust the initial BOM
return encoding;
QByteArray encodingTag = parseHtmlMetaForEncoding(data);
if (!encodingTag.isEmpty())
return encodingForName(encodingTag);
return Utf8;
}
/*!
Tries to determine the encoding of the HTML in \a data by looking at leading byte
order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
matching the encoding. If the returned decoder is not valid,
the encoding specified is not supported by QStringConverter. If no encoding is
detected, the method returns a decoder for Utf8.
\sa isValid()
*/
QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
{
// determine charset
std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
if (encoding)
// trust the initial BOM
return QStringDecoder(encoding.value());
QByteArray encodingTag = parseHtmlMetaForEncoding(data);
if (!encodingTag.isEmpty())
return QStringDecoder(encodingTag);
return QStringDecoder(Utf8);
}
/*!
Returns the canonical name for encoding \a e.
*/

View File

@ -140,6 +140,9 @@ public:
}
return iface->toUtf16(out, ba, &state);
}
Q_CORE_EXPORT static QStringDecoder decoderForHtml(QByteArrayView data);
private:
QString decodeAsString(QByteArrayView in)
{

View File

@ -288,12 +288,11 @@ void QTextBrowserPrivate::setSource(const QUrl &url, QTextDocument::ResourceType
} else if (data.userType() == QMetaType::QByteArray) {
QByteArray ba = data.toByteArray();
if (type == QTextDocument::HtmlResource) {
auto encoding = QStringConverter::encodingForHtml(ba);
if (!encoding)
auto decoder = QStringDecoder::decoderForHtml(ba);
if (!decoder.isValid())
// fall back to utf8
encoding = QStringDecoder::Utf8;
QStringDecoder toUtf16(*encoding);
txt = toUtf16(ba);
decoder = QStringDecoder(QStringDecoder::Utf8);
txt = decoder(ba);
} else {
txt = QString::fromUtf8(ba);
}

View File

@ -2256,65 +2256,102 @@ void tst_QStringConverter::encodingForHtml_data()
{
QTest::addColumn<QByteArray>("html");
QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding");
QTest::addColumn<QByteArray>("name"); // ICU name if we have ICU support
QByteArray html = "<html><head></head><body>blah</body></html>";
QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>";
QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>();
QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-8859-15");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=SJIS\" /></head></html>";
QTest::newRow("sjis") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Shift_JIS");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022-JP\" /></head></html>";
QTest::newRow("ISO-2022-JP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022\" /></head></html>";
QTest::newRow("ISO-2022") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312\" /></head></html>";
QTest::newRow("GB2312") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5\" /></head></html>";
QTest::newRow("Big5") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB18030\" /></head></html>";
QTest::newRow("GB18030") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB18030");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312-HKSCS\" /></head></html>";
QTest::newRow("GB2312-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312-HKSCS");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5-HKSCS\" /></head></html>";
QTest::newRow("Big5-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5-HKSCS");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucJP\" /></head></html>";
QTest::newRow("EucJP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-JP");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucKR\" /></head></html>";
QTest::newRow("EucKR") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-KR");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-R\" /></head></html>";
QTest::newRow("KOI8-R") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-R");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-U\" /></head></html>";
QTest::newRow("KOI8-U") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-U");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>";
QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1");
html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1");
html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>";
QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>";
QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>";
QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>";
QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>";
QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
// Test invalid charsets.
html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>";
QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>();
QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>() << QByteArray();
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>";
QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>();
QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>() << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"";
html.prepend(QByteArray().fill(' ', 512 - html.size()));
QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8";
QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8";
QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>";
QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>();
QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>() << QByteArray();
html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>";
QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>();
QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>() << QByteArray();
const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 };
html = src;
QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE);
QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE) << QByteArray("UTF-16LE");
html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: "
@ -2322,19 +2359,26 @@ void tst_QStringConverter::encodingForHtml_data()
"line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: "
"auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: "
"none;\">&#x37b</span>\000";
QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">"
"<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16.
QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
}
void tst_QStringConverter::encodingForHtml()
{
QFETCH(QByteArray, html);
QFETCH(std::optional<QStringConverter::Encoding>, encoding);
QFETCH(QByteArray, name);
QCOMPARE(QStringConverter::encodingForHtml(html), encoding);
QStringDecoder decoder = QStringDecoder::decoderForHtml(html);
if (encoding || // we should have a valid decoder independent of ICU support
decoder.isValid()) { // we got a valid decoder through ICU
QCOMPARE(decoder.name(), name);
}
}
class LoadAndConvert: public QRunnable