unicode: Generate tables for IDNA/UTS #46
Update the Unicode data processing tool to generate properties and mapping tables needed to implement UTS #46 (https://unicode.org/reports/tr46/). The implementation extends the standard to allow usage of underscores in URLs. This is done for compatibility with DNS-SD and SMB protocols. The data file needed to generate the new properties was taken from https://www.unicode.org/Public/idna/13.0.0/IdnaMappingTable.txt Task-number: QTBUG-85323 Change-Id: I2c303bf8a08aefb18a7491fb9b55385563bfa219 Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
parent
0dbf73e3de
commit
2afe1a3c19
@ -12,6 +12,8 @@ To update:
|
|||||||
Unicode version
|
Unicode version
|
||||||
* In that case, also update main.cpp's initAgeMap and DATA_VERSION_S*
|
* In that case, also update main.cpp's initAgeMap and DATA_VERSION_S*
|
||||||
to match
|
to match
|
||||||
|
* Download https://www.unicode.org/Public/idna/$Version/IdnaMappingTable.txt
|
||||||
|
and put it into data/.
|
||||||
* Build this project. Its binary, unicode, ignores command-line
|
* Build this project. Its binary, unicode, ignores command-line
|
||||||
options and assumes it is being run from this directory. When run,
|
options and assumes it is being run from this directory. When run,
|
||||||
it produces lots of output. If it gets as far as updating
|
it produces lots of output. If it gets as far as updating
|
||||||
|
8727
util/unicode/data/IdnaMappingTable.txt
Normal file
8727
util/unicode/data/IdnaMappingTable.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -805,6 +805,59 @@ static void initScriptMap()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IDNA status as present in the data file
|
||||||
|
// One enumerator per status keyword registered by initIdnaStatusMap().
// The values are spelled out because resolveIdnaStatus() converts the first
// five of them to IdnaStatus with a static_cast, so the numbering must not
// drift.
enum class IdnaRawStatus : unsigned int {
    Disallowed = 0,
    Valid = 1,
    Ignored = 2,
    Mapped = 3,
    Deviation = 4,
    DisallowedStd3Valid = 5,
    DisallowedStd3Mapped = 6,
};
|
||||||
|
|
||||||
|
static QHash<QByteArray, IdnaRawStatus> idnaStatusMap;
|
||||||
|
|
||||||
|
static void initIdnaStatusMap()
|
||||||
|
{
|
||||||
|
struct {
|
||||||
|
IdnaRawStatus status;
|
||||||
|
const char *name;
|
||||||
|
} data[] = {
|
||||||
|
{IdnaRawStatus::Disallowed, "disallowed"},
|
||||||
|
{IdnaRawStatus::Valid, "valid"},
|
||||||
|
{IdnaRawStatus::Ignored, "ignored"},
|
||||||
|
{IdnaRawStatus::Mapped, "mapped"},
|
||||||
|
{IdnaRawStatus::Deviation, "deviation"},
|
||||||
|
{IdnaRawStatus::DisallowedStd3Valid, "disallowed_STD3_valid"},
|
||||||
|
{IdnaRawStatus::DisallowedStd3Mapped, "disallowed_STD3_mapped"},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto &entry : data)
|
||||||
|
idnaStatusMap[entry.name] = entry.status;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Source text of the IdnaStatus enum for the generated header; main() writes
// it just before the `methods` text. It lists the same five enumerators, in
// the same order, as the IdnaStatus enum this tool uses internally.
static const char *idna_status_string =
|
||||||
|
"enum class IdnaStatus : unsigned int {\n"
|
||||||
|
" Disallowed,\n"
|
||||||
|
" Valid,\n"
|
||||||
|
" Ignored,\n"
|
||||||
|
" Mapped,\n"
|
||||||
|
" Deviation\n"
|
||||||
|
"};\n\n";
|
||||||
|
|
||||||
|
// Resolved IDNA status as it goes into the database.
|
||||||
|
// Qt extends host name validity rules to allow underscores
|
||||||
|
// NOTE: The members here should come in the same order and have the same values
|
||||||
|
// as in IdnaRawStatus
|
||||||
|
enum class IdnaStatus : unsigned int {
|
||||||
|
Disallowed,
|
||||||
|
Valid,
|
||||||
|
Ignored,
|
||||||
|
Mapped,
|
||||||
|
Deviation,
|
||||||
|
};
|
||||||
|
|
||||||
// Keep this one in sync with the code in createPropertyInfo
|
// Keep this one in sync with the code in createPropertyInfo
|
||||||
static const char *property_string =
|
static const char *property_string =
|
||||||
"enum Case {\n"
|
"enum Case {\n"
|
||||||
@ -838,7 +891,8 @@ static const char *property_string =
|
|||||||
" ushort graphemeBreakClass : 5; /* 5 used */\n"
|
" ushort graphemeBreakClass : 5; /* 5 used */\n"
|
||||||
" ushort wordBreakClass : 5; /* 5 used */\n"
|
" ushort wordBreakClass : 5; /* 5 used */\n"
|
||||||
" ushort lineBreakClass : 6; /* 6 used */\n"
|
" ushort lineBreakClass : 6; /* 6 used */\n"
|
||||||
" ushort sentenceBreakClass : 8; /* 4 used */\n"
|
" ushort sentenceBreakClass : 4; /* 4 used */\n"
|
||||||
|
" ushort idnaStatus : 4; /* 3 used */\n"
|
||||||
" ushort script : 8;\n"
|
" ushort script : 8;\n"
|
||||||
"};\n\n"
|
"};\n\n"
|
||||||
"Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n"
|
"Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n"
|
||||||
@ -861,6 +915,14 @@ static const char *methods =
|
|||||||
"Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n"
|
"Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n"
|
||||||
"inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
|
"inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
|
||||||
"{ return lineBreakClass(ch.unicode()); }\n"
|
"{ return lineBreakClass(ch.unicode()); }\n"
|
||||||
|
"\n"
|
||||||
|
"Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept;\n"
|
||||||
|
"inline IdnaStatus idnaStatus(QChar ch) noexcept\n"
|
||||||
|
"{ return idnaStatus(ch.unicode()); }\n"
|
||||||
|
"\n"
|
||||||
|
"Q_CORE_EXPORT const char16_t * QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
|
||||||
|
"inline const char16_t *idnaMapping(QChar ch) noexcept\n"
|
||||||
|
"{ return idnaMapping(ch.unicode()); }\n"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
static const int SizeOfPropertiesStruct = 20;
|
static const int SizeOfPropertiesStruct = 20;
|
||||||
@ -899,6 +961,7 @@ struct PropertyFlags {
|
|||||||
&& lineBreakClass == o.lineBreakClass
|
&& lineBreakClass == o.lineBreakClass
|
||||||
&& script == o.script
|
&& script == o.script
|
||||||
&& nfQuickCheck == o.nfQuickCheck
|
&& nfQuickCheck == o.nfQuickCheck
|
||||||
|
&& idnaStatus == o.idnaStatus
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// from UnicodeData.txt
|
// from UnicodeData.txt
|
||||||
@ -928,6 +991,7 @@ struct PropertyFlags {
|
|||||||
int script = QChar::Script_Unknown;
|
int script = QChar::Script_Unknown;
|
||||||
// from DerivedNormalizationProps.txt
|
// from DerivedNormalizationProps.txt
|
||||||
uchar nfQuickCheck = 0;
|
uchar nfQuickCheck = 0;
|
||||||
|
IdnaStatus idnaStatus = IdnaStatus::Disallowed;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -1082,6 +1146,8 @@ struct UnicodeData {
|
|||||||
|
|
||||||
// computed position of unicode property set
|
// computed position of unicode property set
|
||||||
int propertyIndex = -1;
|
int propertyIndex = -1;
|
||||||
|
|
||||||
|
IdnaRawStatus idnaRawStatus = IdnaRawStatus::Disallowed;
|
||||||
};
|
};
|
||||||
|
|
||||||
static QList<UnicodeData> unicodeData;
|
static QList<UnicodeData> unicodeData;
|
||||||
@ -2292,6 +2358,194 @@ static void readScripts()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static QMap<char32_t, QList<char32_t>> idnaMappingTable;
|
||||||
|
|
||||||
|
static void readIdnaMappingTable()
|
||||||
|
{
|
||||||
|
qDebug("Reading IdnaMappingTable.txt");
|
||||||
|
|
||||||
|
QFile f("data/IdnaMappingTable.txt");
|
||||||
|
if (!f.exists() || !f.open(QFile::ReadOnly))
|
||||||
|
qFatal("Couldn't find or read IdnaMappingTable.txt");
|
||||||
|
|
||||||
|
while (!f.atEnd()) {
|
||||||
|
QByteArray line = f.readLine().trimmed();
|
||||||
|
|
||||||
|
int comment = line.indexOf('#');
|
||||||
|
line = (comment < 0 ? line : line.left(comment)).simplified();
|
||||||
|
|
||||||
|
if (line.isEmpty())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
QList<QByteArray> fields = line.split(';');
|
||||||
|
Q_ASSERT(fields.size() >= 2);
|
||||||
|
|
||||||
|
// That would be split(".."), but that API does not exist.
|
||||||
|
const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
|
||||||
|
QList<QByteArray> cl = codePoints.split('.');
|
||||||
|
Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
|
||||||
|
|
||||||
|
const QByteArray statusString = fields[1].trimmed();
|
||||||
|
if (!idnaStatusMap.contains(statusString))
|
||||||
|
qFatal("Unhandled IDNA status property value for %s: %s",
|
||||||
|
qPrintable(codePoints), qPrintable(statusString));
|
||||||
|
IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);
|
||||||
|
|
||||||
|
bool ok;
|
||||||
|
const int first = cl[0].toInt(&ok, 16);
|
||||||
|
const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
|
||||||
|
Q_ASSERT(ok);
|
||||||
|
|
||||||
|
QList<char32_t> mapping;
|
||||||
|
|
||||||
|
switch (rawStatus) {
|
||||||
|
case IdnaRawStatus::Disallowed:
|
||||||
|
case IdnaRawStatus::Valid:
|
||||||
|
case IdnaRawStatus::Ignored:
|
||||||
|
case IdnaRawStatus::DisallowedStd3Valid:
|
||||||
|
break;
|
||||||
|
|
||||||
|
case IdnaRawStatus::Mapped:
|
||||||
|
case IdnaRawStatus::Deviation:
|
||||||
|
case IdnaRawStatus::DisallowedStd3Mapped:
|
||||||
|
Q_ASSERT(fields.size() >= 3);
|
||||||
|
|
||||||
|
for (const auto &s : fields[2].trimmed().split(' ')) {
|
||||||
|
if (!s.isEmpty()) {
|
||||||
|
bool ok;
|
||||||
|
int val = s.toInt(&ok, 16);
|
||||||
|
Q_ASSERT_X(ok, "readIdnaMappingTable", qPrintable(line));
|
||||||
|
mapping.append(val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Some deviations have empty mappings, others should not...
|
||||||
|
if (mapping.isEmpty()) {
|
||||||
|
Q_ASSERT(rawStatus == IdnaRawStatus::Deviation);
|
||||||
|
qDebug() << " Empty IDNA mapping for" << codePoints;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int codepoint = first; codepoint <= last; ++codepoint) {
|
||||||
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
||||||
|
// Ensure that ranges don't overlap.
|
||||||
|
Q_ASSERT(ud.idnaRawStatus == IdnaRawStatus::Disallowed);
|
||||||
|
ud.idnaRawStatus = rawStatus;
|
||||||
|
|
||||||
|
// ASCII codepoints are skipped here because they are processed in separate
|
||||||
|
// optimized code paths that do not use this mapping table.
|
||||||
|
if (codepoint >= 0x80 && !mapping.isEmpty())
|
||||||
|
idnaMappingTable[codepoint] = mapping;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Resolve IDNA status by deciding whether to allow STD3 violations
|
||||||
|
|
||||||
|
Underscores are normally prohibited by STD3 rules but Qt allows underscores
|
||||||
|
to be used inside URLs (see QTBUG-7434 for example). This code changes the
|
||||||
|
underscore status to Valid. The same is done to mapped codepoints that
|
||||||
|
map to underscores combined with other Valid codepoints.
|
||||||
|
|
||||||
|
Underscores in domain names are required when using DNS-SD protocol and they
|
||||||
|
are also allowed by the SMB protocol.
|
||||||
|
*/
|
||||||
|
static void resolveIdnaStatus()
|
||||||
|
{
|
||||||
|
qDebug("resolveIdnaStatus:");
|
||||||
|
|
||||||
|
UnicodeData::valueRef(u'_').idnaRawStatus = IdnaRawStatus::Valid;
|
||||||
|
|
||||||
|
for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
|
||||||
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
||||||
|
switch (ud.idnaRawStatus) {
|
||||||
|
case IdnaRawStatus::Disallowed:
|
||||||
|
case IdnaRawStatus::Valid:
|
||||||
|
case IdnaRawStatus::Ignored:
|
||||||
|
case IdnaRawStatus::Deviation:
|
||||||
|
case IdnaRawStatus::Mapped:
|
||||||
|
ud.p.idnaStatus = static_cast<IdnaStatus>(ud.idnaRawStatus);
|
||||||
|
break;
|
||||||
|
case IdnaRawStatus::DisallowedStd3Valid:
|
||||||
|
ud.p.idnaStatus = IdnaStatus::Disallowed;
|
||||||
|
break;
|
||||||
|
case IdnaRawStatus::DisallowedStd3Mapped: {
|
||||||
|
Q_ASSERT(idnaMappingTable.contains(codepoint));
|
||||||
|
const auto &mapping = idnaMappingTable[codepoint];
|
||||||
|
|
||||||
|
bool allow = std::all_of(mapping.begin(), mapping.end(), [](auto c) {
|
||||||
|
return UnicodeData::valueRef(c).idnaRawStatus == IdnaRawStatus::Valid;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (allow) {
|
||||||
|
qDebug() << " Allowing" << Qt::hex << codepoint;
|
||||||
|
ud.p.idnaStatus = IdnaStatus::Mapped;
|
||||||
|
} else {
|
||||||
|
ud.p.idnaStatus = IdnaStatus::Disallowed;
|
||||||
|
idnaMappingTable.remove(codepoint);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
    Produces the C++ source for the IDNA mapping table: the IdnaMapEntry
    struct, the idnaMap array with one entry per key of idnaMappingTable, and
    the idnaMapping() lookup function that searches it with std::lower_bound.

    Every entry's mapping array has room for the longest mapping (in UTF-16
    code units) plus one terminating 0; shorter mappings are padded with 0.
*/
static QByteArray createIdnaMapping()
{
    qDebug("createIdnaMapping:");

    // Longest mapping, measured in UTF-16 code units.
    size_t maxMappingLength = 0;
    for (auto it = idnaMappingTable.cbegin(); it != idnaMappingTable.cend(); ++it) {
        size_t units = 0;
        for (char32_t c : it.value())
            units += QChar::requiresSurrogates(c) ? 2 : 1;
        if (units > maxMappingLength)
            maxMappingLength = units;
    }

    qDebug() << " max mapping length:" << maxMappingLength;

    QByteArray out;
    out += "struct IdnaMapEntry {\n"
           " char32_t codePoint;\n"
           " char16_t mapping[";
    out += QByteArray::number(maxMappingLength + 1);
    out += "];\n"
           "};\n\n"
           "static const IdnaMapEntry idnaMap[] = {\n";

    qsizetype memoryUsage = 0;
    for (auto it = idnaMappingTable.cbegin(); it != idnaMappingTable.cend(); ++it) {
        out += " { 0x";
        out += QByteArray::number(it.key(), 16);
        out += ", {";

        size_t written = 0;
        for (char32_t c : it.value()) {
            for (auto unit : QChar::fromUcs4(c)) {
                out += "0x";
                out += QByteArray::number(unit, 16);
                out += ", ";
                ++written;
            }
        }

        // Pad up to the common length; the final "0" is the terminator.
        while (written < maxMappingLength) {
            out += "0, ";
            ++written;
        }
        out += "0 }},\n";

        // 4 bytes for codePoint plus 2 bytes per char16_t slot.
        memoryUsage += 4 + 2 * (maxMappingLength + 1);
    }

    qDebug() << " memory usage:" << memoryUsage << "bytes";

    out += "};\n\n";
    out += "Q_CORE_EXPORT const char16_t * QT_FASTCALL idnaMapping(char32_t ucs4) noexcept\n";
    out += "{\n";
    out += " auto i = std::lower_bound(std::begin(idnaMap), std::end(idnaMap), ucs4,\n";
    out += " [](const auto &p, char32_t c) { return p.codePoint < c; });\n";
    out += " if (i != std::end(idnaMap) && i->codePoint == ucs4)\n";
    out += " return i->mapping;\n";
    out += " return nullptr;\n";
    out += "}\n\n";

    return out;
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
static void dump(int from, int to)
|
static void dump(int from, int to)
|
||||||
{
|
{
|
||||||
@ -2532,9 +2786,12 @@ static QByteArray createPropertyInfo()
|
|||||||
out += ", ";
|
out += ", ";
|
||||||
out += QByteArray::number( p.lineBreakClass );
|
out += QByteArray::number( p.lineBreakClass );
|
||||||
out += ", ";
|
out += ", ";
|
||||||
// " ushort sentenceBreakClass : 8; /* 4 used */\n"
|
// " ushort sentenceBreakClass : 4; /* 4 used */\n"
|
||||||
out += QByteArray::number( p.sentenceBreakClass );
|
out += QByteArray::number( p.sentenceBreakClass );
|
||||||
out += ", ";
|
out += ", ";
|
||||||
|
// " ushort idnaStatus : 4; /* 3 used */\n"
|
||||||
|
out += QByteArray::number( static_cast<unsigned int>(p.idnaStatus) );
|
||||||
|
out += ", ";
|
||||||
// " ushort script : 8;\n"
|
// " ushort script : 8;\n"
|
||||||
out += QByteArray::number( p.script );
|
out += QByteArray::number( p.script );
|
||||||
out += " },";
|
out += " },";
|
||||||
@ -2595,6 +2852,11 @@ static QByteArray createPropertyInfo()
|
|||||||
"{\n"
|
"{\n"
|
||||||
" return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
|
" return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
|
||||||
"}\n"
|
"}\n"
|
||||||
|
"\n"
|
||||||
|
"Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept\n"
|
||||||
|
"{\n"
|
||||||
|
" return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
|
||||||
|
"}\n"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
@ -3059,6 +3321,7 @@ int main(int, char **)
|
|||||||
initSentenceBreak();
|
initSentenceBreak();
|
||||||
initLineBreak();
|
initLineBreak();
|
||||||
initScriptMap();
|
initScriptMap();
|
||||||
|
initIdnaStatusMap();
|
||||||
|
|
||||||
readUnicodeData();
|
readUnicodeData();
|
||||||
readBidiMirroring();
|
readBidiMirroring();
|
||||||
@ -3074,6 +3337,9 @@ int main(int, char **)
|
|||||||
readWordBreak();
|
readWordBreak();
|
||||||
readSentenceBreak();
|
readSentenceBreak();
|
||||||
readLineBreak();
|
readLineBreak();
|
||||||
|
readIdnaMappingTable();
|
||||||
|
|
||||||
|
resolveIdnaStatus();
|
||||||
|
|
||||||
computeUniqueProperties();
|
computeUniqueProperties();
|
||||||
QByteArray properties = createPropertyInfo();
|
QByteArray properties = createPropertyInfo();
|
||||||
@ -3081,6 +3347,7 @@ int main(int, char **)
|
|||||||
QByteArray compositions = createCompositionInfo();
|
QByteArray compositions = createCompositionInfo();
|
||||||
QByteArray ligatures = createLigatureInfo();
|
QByteArray ligatures = createLigatureInfo();
|
||||||
QByteArray normalizationCorrections = createNormalizationCorrections();
|
QByteArray normalizationCorrections = createNormalizationCorrections();
|
||||||
|
QByteArray idnaMapping = createIdnaMapping();
|
||||||
|
|
||||||
QByteArray header =
|
QByteArray header =
|
||||||
"/****************************************************************************\n"
|
"/****************************************************************************\n"
|
||||||
@ -3150,6 +3417,7 @@ int main(int, char **)
|
|||||||
f.write(ligatures);
|
f.write(ligatures);
|
||||||
f.write("\n");
|
f.write("\n");
|
||||||
f.write(normalizationCorrections);
|
f.write(normalizationCorrections);
|
||||||
|
f.write(idnaMapping);
|
||||||
f.write("} // namespace QUnicodeTables\n\n");
|
f.write("} // namespace QUnicodeTables\n\n");
|
||||||
f.write("using namespace QUnicodeTables;\n\n");
|
f.write("using namespace QUnicodeTables;\n\n");
|
||||||
f.write("QT_END_NAMESPACE\n");
|
f.write("QT_END_NAMESPACE\n");
|
||||||
@ -3173,6 +3441,7 @@ int main(int, char **)
|
|||||||
f.write(word_break_class_string);
|
f.write(word_break_class_string);
|
||||||
f.write(sentence_break_class_string);
|
f.write(sentence_break_class_string);
|
||||||
f.write(line_break_class_string);
|
f.write(line_break_class_string);
|
||||||
|
f.write(idna_status_string);
|
||||||
f.write(methods);
|
f.write(methods);
|
||||||
f.write("} // namespace QUnicodeTables\n\n"
|
f.write("} // namespace QUnicodeTables\n\n"
|
||||||
"QT_END_NAMESPACE\n\n"
|
"QT_END_NAMESPACE\n\n"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user