QUrl IDNA: Update to Unicode 15.1
Unicode 15.1 (more specifically UTS #46, revision 31) changes how host names are processed. The initial Unicode host name mapping is done without validity checking. That check was used in the past to mark QUrls invalid. This patch inserts a simplified validity check later. This check is similar to the one performed before conversion to Unicode, but does not include the BiDi check, to keep names starting with xn-- valid. An additional complication is that U+1E9E LATIN CAPITAL LETTER SHARP S must be mapped to "ss" with transitional processing. It is no longer possible to predict whether the Qt implementation considers a URL valid by using only the error codes in the test vectors file. The test was adjusted to expect an empty string (indicating invalid QUrl) or a string matching the entry in the vectors file if there are any processing errors specified for that entry. Unblacklist tst_QUrlUts46::idnaTestV2. Task-number: QTBUG-121529 Change-Id: Iad5dadd1a6695fa54b432e35000b350cd6e06341 Reviewed-by: Edward Welbourne <edward.welbourne@qt.io> Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
This commit is contained in:
parent
bfd09ec38c
commit
b2764f7802
@ -423,13 +423,19 @@ static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions optio
|
|||||||
if (uc >= U'A' && uc <= U'Z')
|
if (uc >= U'A' && uc <= U'Z')
|
||||||
uc |= 0x20; // lower-case it
|
uc |= 0x20; // lower-case it
|
||||||
|
|
||||||
if (!isValidInNormalizedAsciiName(uc))
|
if (isValidInNormalizedAsciiName(uc)) {
|
||||||
return {};
|
result.append(static_cast<char16_t>(uc));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
result.append(static_cast<char16_t>(uc));
|
allAscii = false;
|
||||||
|
|
||||||
|
// Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1)
|
||||||
|
if (uc == 0x1E9E && options.testFlag(QUrl::AceTransitionalProcessing)) {
|
||||||
|
result.append(u"ss"_s);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
allAscii = false;
|
|
||||||
|
|
||||||
QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(uc);
|
QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(uc);
|
||||||
|
|
||||||
@ -442,14 +448,13 @@ static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions optio
|
|||||||
case QUnicodeTables::IdnaStatus::Ignored:
|
case QUnicodeTables::IdnaStatus::Ignored:
|
||||||
continue;
|
continue;
|
||||||
case QUnicodeTables::IdnaStatus::Valid:
|
case QUnicodeTables::IdnaStatus::Valid:
|
||||||
|
case QUnicodeTables::IdnaStatus::Disallowed:
|
||||||
for (auto c : QChar::fromUcs4(uc))
|
for (auto c : QChar::fromUcs4(uc))
|
||||||
result.append(c);
|
result.append(c);
|
||||||
break;
|
break;
|
||||||
case QUnicodeTables::IdnaStatus::Mapped:
|
case QUnicodeTables::IdnaStatus::Mapped:
|
||||||
result.append(QUnicodeTables::idnaMapping(uc));
|
result.append(QUnicodeTables::idnaMapping(uc));
|
||||||
break;
|
break;
|
||||||
case QUnicodeTables::IdnaStatus::Disallowed:
|
|
||||||
return {};
|
|
||||||
default:
|
default:
|
||||||
Q_UNREACHABLE();
|
Q_UNREACHABLE();
|
||||||
}
|
}
|
||||||
@ -483,12 +488,13 @@ class DomainValidityChecker
|
|||||||
{
|
{
|
||||||
bool domainNameIsBidi = false;
|
bool domainNameIsBidi = false;
|
||||||
bool hadBidiErrors = false;
|
bool hadBidiErrors = false;
|
||||||
|
bool ignoreBidiErrors;
|
||||||
|
|
||||||
static constexpr char32_t ZWNJ = U'\u200C';
|
static constexpr char32_t ZWNJ = U'\u200C';
|
||||||
static constexpr char32_t ZWJ = U'\u200D';
|
static constexpr char32_t ZWJ = U'\u200D';
|
||||||
|
|
||||||
public:
|
public:
|
||||||
DomainValidityChecker() { }
|
DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { }
|
||||||
bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
|
bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -714,7 +720,7 @@ bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessing
|
|||||||
// because non-BMP characters are unlikely to be used for specifying
|
// because non-BMP characters are unlikely to be used for specifying
|
||||||
// future extensions.
|
// future extensions.
|
||||||
if (label[2] == u'-' && label[3] == u'-')
|
if (label[2] == u'-' && label[3] == u'-')
|
||||||
return false;
|
return ignoreBidiErrors && label.startsWith(u"xn") && validateAsciiLabel(label);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (label.startsWith(u'-') || label.endsWith(u'-'))
|
if (label.startsWith(u'-') || label.endsWith(u'-'))
|
||||||
@ -736,7 +742,7 @@ bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessing
|
|||||||
for (;;) {
|
for (;;) {
|
||||||
hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ;
|
hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ;
|
||||||
|
|
||||||
if (!domainNameIsBidi) {
|
if (!ignoreBidiErrors && !domainNameIsBidi) {
|
||||||
switch (QChar::direction(c)) {
|
switch (QChar::direction(c)) {
|
||||||
case QChar::DirR:
|
case QChar::DirR:
|
||||||
case QChar::DirAL:
|
case QChar::DirAL:
|
||||||
@ -784,17 +790,12 @@ static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
|
|||||||
QString aceResult;
|
QString aceResult;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
auto idx = normalizedDomain.indexOf(u'.', lastIdx);
|
qsizetype idx = normalizedDomain.indexOf(u'.', lastIdx);
|
||||||
if (idx == -1)
|
if (idx == -1)
|
||||||
idx = normalizedDomain.size();
|
idx = normalizedDomain.size();
|
||||||
|
|
||||||
const auto labelLength = idx - lastIdx;
|
const qsizetype labelLength = idx - lastIdx;
|
||||||
if (labelLength == 0) {
|
if (labelLength) {
|
||||||
if (idx == normalizedDomain.size())
|
|
||||||
break;
|
|
||||||
if (dot == ForbidLeadingDot || idx > 0)
|
|
||||||
return {}; // two delimiters in a row -- empty label not allowed
|
|
||||||
} else {
|
|
||||||
const auto label = normalizedDomain.sliced(lastIdx, labelLength);
|
const auto label = normalizedDomain.sliced(lastIdx, labelLength);
|
||||||
aceForm.clear();
|
aceForm.clear();
|
||||||
qt_punycodeEncoder(label, &aceForm);
|
qt_punycodeEncoder(label, &aceForm);
|
||||||
@ -807,6 +808,9 @@ static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
|
|||||||
if (idx == normalizedDomain.size())
|
if (idx == normalizedDomain.size())
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
if (labelLength == 0 && (dot == ForbidLeadingDot || idx > 0))
|
||||||
|
return {}; // two delimiters in a row -- empty label not allowed
|
||||||
|
|
||||||
lastIdx = idx + 1;
|
lastIdx = idx + 1;
|
||||||
aceResult += u'.';
|
aceResult += u'.';
|
||||||
}
|
}
|
||||||
@ -886,6 +890,33 @@ static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingO
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options)
|
||||||
|
{
|
||||||
|
qsizetype lastIdx = 0;
|
||||||
|
|
||||||
|
DomainValidityChecker checker(true);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
qsizetype idx = domainName.indexOf(u'.', lastIdx);
|
||||||
|
if (idx == -1)
|
||||||
|
idx = domainName.size();
|
||||||
|
|
||||||
|
const qsizetype labelLength = idx - lastIdx;
|
||||||
|
if (labelLength) {
|
||||||
|
const auto label = domainName.sliced(lastIdx, labelLength);
|
||||||
|
|
||||||
|
if (!checker.checkLabel(label, options))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (idx == domainName.size())
|
||||||
|
break;
|
||||||
|
|
||||||
|
lastIdx = idx + 1;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
|
QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
|
||||||
QUrl::AceProcessingOptions options)
|
QUrl::AceProcessingOptions options)
|
||||||
{
|
{
|
||||||
@ -900,6 +931,9 @@ QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
|
|||||||
if (normalized.isEmpty())
|
if (normalized.isEmpty())
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
|
if (!mappedToAscii && !checkUnicodeName(normalized, options))
|
||||||
|
return {};
|
||||||
|
|
||||||
bool needsConversionToUnicode;
|
bool needsConversionToUnicode;
|
||||||
const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalized, dot);
|
const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalized, dot);
|
||||||
if (aceResult.isEmpty() || !checkAsciiDomainName(aceResult, dot, &needsConversionToUnicode))
|
if (aceResult.isEmpty() || !checkAsciiDomainName(aceResult, dot, &needsConversionToUnicode))
|
||||||
|
@ -1,3 +0,0 @@
|
|||||||
# QTBUG-121529: Tests need to be updated to Unicode 15.1
|
|
||||||
[idnaTestV2]
|
|
||||||
*
|
|
@ -16,11 +16,11 @@ private Q_SLOTS:
|
|||||||
void idnaTestV2();
|
void idnaTestV2();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// All error codes:
|
// All error codes in UTR #46 revision 31 (Unicode 15.1):
|
||||||
// A3, A4_1, A4_2,
|
// A4_1, A4_2,
|
||||||
// B1, B2, B3, B4, B5, B6,
|
// B1, B2, B3, B4, B5, B6,
|
||||||
// C1, C2,
|
// C1, C2,
|
||||||
// P1, P4,
|
// P4,
|
||||||
// V1, V2, V3, V5, V6,
|
// V1, V2, V3, V5, V6,
|
||||||
// X4_2
|
// X4_2
|
||||||
//
|
//
|
||||||
@ -28,7 +28,9 @@ private:
|
|||||||
static const QSet<QByteArray> fatalErrors;
|
static const QSet<QByteArray> fatalErrors;
|
||||||
};
|
};
|
||||||
|
|
||||||
const QSet<QByteArray> tst_QUrlUts46::fatalErrors = { "A3", "A4_2", "P1", "X4_2" };
|
const QSet<QByteArray> tst_QUrlUts46::fatalErrors = {
|
||||||
|
"A4_2", // Empty ASCII label
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Replace \uXXXX escapes in test case fields.
|
* Replace \uXXXX escapes in test case fields.
|
||||||
@ -124,22 +126,19 @@ void tst_QUrlUts46::idnaTestV2()
|
|||||||
QFETCH(QString, toAsciiT);
|
QFETCH(QString, toAsciiT);
|
||||||
QFETCH(bool, toAsciiTOk);
|
QFETCH(bool, toAsciiTOk);
|
||||||
|
|
||||||
auto dashesOk = [](const QString &domain) {
|
|
||||||
const auto labels = domain.split(u'.');
|
|
||||||
return std::all_of(labels.begin(), labels.end(), [](const QString &label) {
|
|
||||||
return label.isEmpty() || !(label.startsWith(u'-') || label.endsWith(u'-'));
|
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
QString toAceN = QUrl::toAce(source);
|
QString toAceN = QUrl::toAce(source);
|
||||||
if (toAsciiNOk && dashesOk(toAsciiN))
|
if (toUnicodeOk && toAsciiNOk)
|
||||||
QCOMPARE(toAceN, toAsciiN);
|
QCOMPARE(toAceN, toAsciiN);
|
||||||
|
else if (toAsciiNOk)
|
||||||
|
QVERIFY(toAceN.isEmpty() || toAceN == toAsciiN);
|
||||||
else
|
else
|
||||||
QCOMPARE(toAceN, QString());
|
QCOMPARE(toAceN, QString());
|
||||||
|
|
||||||
QString toAceT = QUrl::toAce(source, QUrl::AceTransitionalProcessing);
|
QString toAceT = QUrl::toAce(source, QUrl::AceTransitionalProcessing);
|
||||||
if (toAsciiTOk && dashesOk(toAsciiT))
|
if (toUnicodeOk && toAsciiTOk)
|
||||||
QCOMPARE(toAceT, toAsciiT);
|
QCOMPARE(toAceT, toAsciiT);
|
||||||
|
else if (toAsciiTOk)
|
||||||
|
QVERIFY(toAceT.isEmpty() || toAceT == toAsciiT);
|
||||||
else
|
else
|
||||||
QCOMPARE(toAceT, QString());
|
QCOMPARE(toAceT, QString());
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user