QRestReply: make charset parsing more robust
The old code was based on QString::split and e.g. didn't handle escapes in quoted-strings. Write a modern recursive-descent parser to parse out type/subtype and charset parameter. Since we don't, yet, support CFWS (see below), recursion depth is strictly limited, so we're not susceptible to recursion bombs here. It currently handles only RFC9110-style grammar, but can be easily extended to support CFWS (RFC2822-style comments and folding white-space) or RFC2231-style continuations, if needed. It's a bit more general than strictly required, because I expect this to be reused elsewhere ere long. Manual conflict resolutions: - dropped a few constexpr from functions which use QByteArrayView's startsWith(), which is only constexpr since 6.8 Fixes: QTBUG-120307 Change-Id: I309928dc350a043672dffb4a259b457764c031be Reviewed-by: Juha Vuolle <juha.vuolle@qt.io> (cherry picked from commit 98b034e53a7821018683c05d5dba776f0f2753f0)
This commit is contained in:
parent
a8ef8ea550
commit
8302c727e6
@ -4,11 +4,17 @@
|
|||||||
#include "qrestreply.h"
|
#include "qrestreply.h"
|
||||||
#include "qrestreply_p.h"
|
#include "qrestreply_p.h"
|
||||||
|
|
||||||
|
#include <QtNetwork/private/qnetworkreply_p.h>
|
||||||
|
|
||||||
|
#include <QtCore/qbytearrayview.h>
|
||||||
#include <QtCore/qjsondocument.h>
|
#include <QtCore/qjsondocument.h>
|
||||||
#include <QtCore/qlatin1stringmatcher.h>
|
#include <QtCore/qlatin1stringmatcher.h>
|
||||||
|
#include <QtCore/qlatin1stringview.h>
|
||||||
#include <QtCore/qloggingcategory.h>
|
#include <QtCore/qloggingcategory.h>
|
||||||
#include <QtCore/qstringconverter.h>
|
#include <QtCore/qstringconverter.h>
|
||||||
|
|
||||||
|
#include <QtCore/qxpfunctional.h>
|
||||||
|
|
||||||
QT_BEGIN_NAMESPACE
|
QT_BEGIN_NAMESPACE
|
||||||
|
|
||||||
using namespace Qt::StringLiterals;
|
using namespace Qt::StringLiterals;
|
||||||
@ -335,6 +341,201 @@ QDebug operator<<(QDebug debug, const QRestReply &reply)
|
|||||||
}
|
}
|
||||||
#endif // QT_NO_DEBUG_STREAM
|
#endif // QT_NO_DEBUG_STREAM
|
||||||
|
|
||||||
|
// Splits a leading run of optional whitespace (spaces and horizontal tabs)
// off the front of `data`.
//
// Returns a struct with:
//   ows  - the leading whitespace run (possibly empty)
//   tail - everything after that run
static constexpr auto parse_OWS(QByteArrayView data) noexcept
{
    struct R {
        QByteArrayView ows, tail;
    };

    // Advance `len` past every leading SP / HTAB octet.
    qsizetype len = 0;
    for (; len < data.size(); ++len) {
        const auto c = data[len];
        if (c != ' ' && c != '\t')
            break;
    }

    return R{data.first(len), data.sliced(len)};
}
|
||||||
|
|
||||||
|
// Strips leading spaces and horizontal tabs from `data` in place, i.e. `data`
// is re-pointed at the remainder that parse_OWS() reports as its tail.
static constexpr void eat_OWS(QByteArrayView &data) noexcept
{
    const auto split = parse_OWS(data);
    data = split.tail;
}
|
||||||
|
|
||||||
|
static auto parse_quoted_string(QByteArrayView data, qxp::function_ref<void(char) const> yield)
|
||||||
|
{
|
||||||
|
struct R {
|
||||||
|
QByteArrayView quotedString, tail;
|
||||||
|
constexpr explicit operator bool() const noexcept { return !quotedString.isEmpty(); }
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!data.startsWith('"'))
|
||||||
|
return R{{}, data};
|
||||||
|
|
||||||
|
qsizetype i = 1; // one past initial DQUOTE
|
||||||
|
while (i < data.size()) {
|
||||||
|
switch (auto ch = data[i++]) {
|
||||||
|
case '"': // final DQUOTE -> end of string
|
||||||
|
return R{data.first(i), data.sliced(i)};
|
||||||
|
case '\\': // quoted-pair
|
||||||
|
// https://www.rfc-editor.org/rfc/rfc9110.html#section-5.6.4-3:
|
||||||
|
// Recipients that process the value of a quoted-string MUST handle a
|
||||||
|
// quoted-pair as if it were replaced by the octet following the backslash.
|
||||||
|
if (i == data.size())
|
||||||
|
break; // premature end
|
||||||
|
ch = data[i++]; // eat '\\'
|
||||||
|
[[fallthrough]];
|
||||||
|
default:
|
||||||
|
// we don't validate quoted-string octets to be only qdtext (Postel's Law)
|
||||||
|
yield(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return R{{}, data}; // premature end
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns whether `ch` is a token character: an ASCII letter, an ASCII digit,
// or one of the punctuation characters ! # $ % & ' * + - . ^ _ ` | ~
static constexpr bool is_tchar(char ch) noexcept
{
    // ### optimize
    const bool isDigit = ch >= '0' && ch <= '9';
    const bool isUpper = ch >= 'A' && ch <= 'Z';
    const bool isLower = ch >= 'a' && ch <= 'z';
    if (isDigit || isUpper || isLower)
        return true;

    // Walk the table up to (not including) its NUL terminator, so that a
    // NUL input is never reported as a token character.
    constexpr char punctuation[] = "!#$%&'*+-.^_`|~";
    for (const char *p = punctuation; *p != '\0'; ++p) {
        if (*p == ch)
            return true;
    }
    return false;
}
|
||||||
|
|
||||||
|
// Splits the longest leading run of token characters (see is_tchar()) off the
// front of `data`.
//
// Returns a struct with:
//   token - the leading token (empty if `data` does not start with a tchar)
//   tail  - everything after the token
// The result converts to `true` only if a non-empty token was found.
static constexpr auto parse_token(QByteArrayView data) noexcept
{
    struct R {
        QByteArrayView token, tail;
        constexpr explicit operator bool() const noexcept { return !token.isEmpty(); }
    };

    qsizetype len = 0;
    for (const qsizetype size = data.size(); len != size; ++len) {
        if (!is_tchar(data[len]))
            break;
    }

    return R{data.first(len), data.sliced(len)};
}
|
||||||
|
|
||||||
|
static auto parse_parameter(QByteArrayView data, qxp::function_ref<void(char) const> yield)
|
||||||
|
{
|
||||||
|
struct R {
|
||||||
|
QLatin1StringView name; QByteArrayView value; QByteArrayView tail;
|
||||||
|
constexpr explicit operator bool() const noexcept { return !name.isEmpty(); }
|
||||||
|
};
|
||||||
|
|
||||||
|
const auto invalid = R{{}, {}, data}; // preserves original `data`
|
||||||
|
|
||||||
|
// parameter = parameter-name "=" parameter-value
|
||||||
|
// parameter-name = token
|
||||||
|
// parameter-value = ( token / quoted-string )
|
||||||
|
|
||||||
|
const auto name = parse_token(data);
|
||||||
|
if (!name)
|
||||||
|
return invalid;
|
||||||
|
data = name.tail;
|
||||||
|
|
||||||
|
eat_OWS(data); // not in the grammar, but accepted under Postel's Law
|
||||||
|
|
||||||
|
if (!data.startsWith('='))
|
||||||
|
return invalid;
|
||||||
|
data = data.sliced(1);
|
||||||
|
|
||||||
|
eat_OWS(data); // not in the grammar, but accepted under Postel's Law
|
||||||
|
|
||||||
|
if (Q_UNLIKELY(data.startsWith('"'))) { // value is a quoted-string
|
||||||
|
|
||||||
|
const auto value = parse_quoted_string(data, yield);
|
||||||
|
if (!value)
|
||||||
|
return invalid;
|
||||||
|
data = value.tail;
|
||||||
|
|
||||||
|
return R{QLatin1StringView{name.token}, value.quotedString, data};
|
||||||
|
|
||||||
|
} else { // value is a token
|
||||||
|
|
||||||
|
const auto value = parse_token(data);
|
||||||
|
if (!value)
|
||||||
|
return invalid;
|
||||||
|
data = value.tail;
|
||||||
|
|
||||||
|
return R{QLatin1StringView{name.token}, value.token, data};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static auto parse_content_type(QByteArrayView data)
|
||||||
|
{
|
||||||
|
struct R {
|
||||||
|
QLatin1StringView type, subtype;
|
||||||
|
std::string charset;
|
||||||
|
constexpr explicit operator bool() const noexcept { return !type.isEmpty(); }
|
||||||
|
};
|
||||||
|
|
||||||
|
eat_OWS(data); // not in the grammar, but accepted under Postel's Law
|
||||||
|
|
||||||
|
const auto type = parse_token(data);
|
||||||
|
if (!type)
|
||||||
|
return R{};
|
||||||
|
data = type.tail;
|
||||||
|
|
||||||
|
eat_OWS(data); // not in the grammar, but accepted under Postel's Law
|
||||||
|
|
||||||
|
if (!data.startsWith('/'))
|
||||||
|
return R{};
|
||||||
|
data = data.sliced(1);
|
||||||
|
|
||||||
|
eat_OWS(data); // not in the grammar, but accepted under Postel's Law
|
||||||
|
|
||||||
|
const auto subtype = parse_token(data);
|
||||||
|
if (!subtype)
|
||||||
|
return R{};
|
||||||
|
data = subtype.tail;
|
||||||
|
|
||||||
|
eat_OWS(data);
|
||||||
|
|
||||||
|
auto r = R{QLatin1StringView{type.token}, QLatin1StringView{subtype.token}, {}};
|
||||||
|
|
||||||
|
while (data.startsWith(';')) {
|
||||||
|
|
||||||
|
data = data.sliced(1); // eat ';'
|
||||||
|
|
||||||
|
eat_OWS(data);
|
||||||
|
|
||||||
|
const auto param = parse_parameter(data, [&](char ch) { r.charset.append(1, ch); });
|
||||||
|
if (param.name.compare("charset"_L1, Qt::CaseInsensitive) == 0) {
|
||||||
|
if (r.charset.empty() && !param.value.startsWith('"')) // wasn't a quoted-string
|
||||||
|
r.charset.assign(param.value.begin(), param.value.end());
|
||||||
|
return r; // charset found
|
||||||
|
}
|
||||||
|
r.charset.clear(); // wasn't an actual charset
|
||||||
|
if (param.tail.size() == data.size()) // no progress was made
|
||||||
|
break; // returns {type, subtype}
|
||||||
|
// otherwise, continue (accepting e.g. `;;`)
|
||||||
|
data = param.tail;
|
||||||
|
|
||||||
|
eat_OWS(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
return r; // no charset found
|
||||||
|
}
|
||||||
|
|
||||||
QByteArray QRestReplyPrivate::contentCharset(const QNetworkReply* reply)
|
QByteArray QRestReplyPrivate::contentCharset(const QNetworkReply* reply)
|
||||||
{
|
{
|
||||||
// Content-type consists of mimetype and optional parameters, of which one may be 'charset'
|
// Content-type consists of mimetype and optional parameters, of which one may be 'charset'
|
||||||
@ -345,28 +546,15 @@ QByteArray QRestReplyPrivate::contentCharset(const QNetworkReply* reply)
|
|||||||
// text/plain; charset=utf-8;version=1.7
|
// text/plain; charset=utf-8;version=1.7
|
||||||
// text/plain; charset = utf-8
|
// text/plain; charset = utf-8
|
||||||
// text/plain; charset ="utf-8"
|
// text/plain; charset ="utf-8"
|
||||||
// Default to the most commonly used UTF-8.
|
|
||||||
QByteArray charset{"UTF-8"};
|
|
||||||
const QByteArray contentTypeValue =
|
const QByteArray contentTypeValue =
|
||||||
reply->header(QNetworkRequest::KnownHeaders::ContentTypeHeader).toByteArray();
|
reply->header(QNetworkRequest::KnownHeaders::ContentTypeHeader).toByteArray();
|
||||||
|
|
||||||
QList<QByteArray> parameters = contentTypeValue.split(';');
|
const auto r = parse_content_type(contentTypeValue);
|
||||||
if (parameters.size() >= 2) { // Need at least one parameter in addition to the mimetype itself
|
if (r && !r.charset.empty())
|
||||||
parameters.removeFirst(); // Exclude the mimetype itself, only interested in parameters
|
return QByteArrayView(r.charset).toByteArray();
|
||||||
QLatin1StringMatcher matcher("charset="_L1, Qt::CaseSensitivity::CaseInsensitive);
|
else
|
||||||
qsizetype matchIndex = -1;
|
return "UTF-8"_ba; // Default to the most commonly used UTF-8.
|
||||||
for (auto ¶meter : parameters) {
|
|
||||||
// Remove whitespaces and parantheses
|
|
||||||
const QByteArray curated = parameter.replace(" ", "").replace("\"","");
|
|
||||||
// Check for match
|
|
||||||
matchIndex = matcher.indexIn(QLatin1String(curated.constData()));
|
|
||||||
if (matchIndex >= 0) {
|
|
||||||
charset = curated.sliced(matchIndex + 8); // 8 is size of "charset="
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return charset;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
QT_END_NAMESPACE
|
QT_END_NAMESPACE
|
||||||
|
@ -666,7 +666,7 @@ void tst_QRestAccessManager::json()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define VERIFY_TEXT_REPLY_OK_IMPL(...) \
|
#define VERIFY_TEXT_REPLY_OK \
|
||||||
{ \
|
{ \
|
||||||
manager.get(request, this, [&](QRestReply &reply) { networkReply = reply.networkReply(); }); \
|
manager.get(request, this, [&](QRestReply &reply) { networkReply = reply.networkReply(); }); \
|
||||||
QTRY_VERIFY(networkReply); \
|
QTRY_VERIFY(networkReply); \
|
||||||
@ -674,11 +674,8 @@ void tst_QRestAccessManager::json()
|
|||||||
responseString = restReply.readText(); \
|
responseString = restReply.readText(); \
|
||||||
networkReply->deleteLater(); \
|
networkReply->deleteLater(); \
|
||||||
networkReply = nullptr; \
|
networkReply = nullptr; \
|
||||||
__VA_ARGS__ ; \
|
|
||||||
QCOMPARE(responseString, sourceString); \
|
QCOMPARE(responseString, sourceString); \
|
||||||
}
|
}
|
||||||
#define VERIFY_TEXT_REPLY_OK VERIFY_TEXT_REPLY_OK_IMPL(do {} while (false))
|
|
||||||
#define VERIFY_TEXT_REPLY_XFAIL(JIRA) VERIFY_TEXT_REPLY_OK_IMPL(QEXPECT_FAIL("", #JIRA, Continue))
|
|
||||||
|
|
||||||
#define VERIFY_TEXT_REPLY_ERROR(WARNING_MESSAGE) \
|
#define VERIFY_TEXT_REPLY_ERROR(WARNING_MESSAGE) \
|
||||||
{ \
|
{ \
|
||||||
@ -733,16 +730,12 @@ void tst_QRestAccessManager::text()
|
|||||||
// Successful UTF-8 (obfuscated)
|
// Successful UTF-8 (obfuscated)
|
||||||
serverSideResponse.headers["Content-Type:"_ba] = "text/plain; charset=\"UT\\F-8\""_ba;
|
serverSideResponse.headers["Content-Type:"_ba] = "text/plain; charset=\"UT\\F-8\""_ba;
|
||||||
serverSideResponse.body = encUTF8(sourceString);
|
serverSideResponse.body = encUTF8(sourceString);
|
||||||
#if QT_CONFIG(icu) // ICU ignores `\` during name lookup, making this test succeed when it shouldn't
|
|
||||||
VERIFY_TEXT_REPLY_OK;
|
VERIFY_TEXT_REPLY_OK;
|
||||||
#else
|
|
||||||
VERIFY_TEXT_REPLY_XFAIL(QTBUG-120307);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Successful UTF-8 (empty charset)
|
// Successful UTF-8 (empty charset)
|
||||||
serverSideResponse.headers["Content-Type:"_ba] = "text/plain; charset=\"\""_ba;
|
serverSideResponse.headers["Content-Type:"_ba] = "text/plain; charset=\"\""_ba;
|
||||||
serverSideResponse.body = encUTF8(sourceString);
|
serverSideResponse.body = encUTF8(sourceString);
|
||||||
VERIFY_TEXT_REPLY_XFAIL(QTBUG-120307);
|
VERIFY_TEXT_REPLY_OK;
|
||||||
|
|
||||||
// Successful UTF-8 (implicit)
|
// Successful UTF-8 (implicit)
|
||||||
serverSideResponse.headers["Content-Type:"_ba] = "text/plain"_ba;
|
serverSideResponse.headers["Content-Type:"_ba] = "text/plain"_ba;
|
||||||
@ -774,7 +767,7 @@ void tst_QRestAccessManager::text()
|
|||||||
serverSideResponse.headers.insert("Content-Type:"_ba,
|
serverSideResponse.headers.insert("Content-Type:"_ba,
|
||||||
"text/plain; extraparameter=bar;charset = \"UT\\F-32\""_ba);
|
"text/plain; extraparameter=bar;charset = \"UT\\F-32\""_ba);
|
||||||
serverSideResponse.body = encUTF32(sourceString);
|
serverSideResponse.body = encUTF32(sourceString);
|
||||||
VERIFY_TEXT_REPLY_XFAIL(QTBUG-120307);
|
VERIFY_TEXT_REPLY_OK;
|
||||||
|
|
||||||
{
|
{
|
||||||
// Unsuccessful UTF-32, wrong encoding indicated (indicated UTF-32 but data is UTF-8)
|
// Unsuccessful UTF-32, wrong encoding indicated (indicated UTF-32 but data is UTF-8)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user