unicode: Generate tables for IDNA/UTS #46

Update the Unicode data processing tool to generate properties
and mapping tables needed to implement UTS #46
(https://unicode.org/reports/tr46/). The implementation extends
the standard to allow usage of underscores in URLs. This is done
for compatibility with DNS-SD and SMB protocols.

The data file needed to generate the new properties was taken from
https://www.unicode.org/Public/idna/13.0.0/IdnaMappingTable.txt

Task-number: QTBUG-85323
Change-Id: I2c303bf8a08aefb18a7491fb9b55385563bfa219
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Ievgenii Meshcheriakov 2021-07-30 12:09:46 +02:00
parent 0dbf73e3de
commit 2afe1a3c19
3 changed files with 9000 additions and 2 deletions

View File

@ -12,6 +12,8 @@ To update:
Unicode version Unicode version
* In that case, also update main.cpp's initAgeMap and DATA_VERSION_S* * In that case, also update main.cpp's initAgeMap and DATA_VERSION_S*
to match to match
* Download https://www.unicode.org/Public/idna/$Version/IdnaMappingTable.txt
and put it into data/.
* Build this project. Its binary, unicode, ignores command-line * Build this project. Its binary, unicode, ignores command-line
options and assumes it is being run from this directory. When run, options and assumes it is being run from this directory. When run,
it produces lots of output. If it gets as far as updating it produces lots of output. If it gets as far as updating

File diff suppressed because it is too large Load Diff

View File

@ -805,6 +805,59 @@ static void initScriptMap()
} }
} }
// IDNA status as present in the data file (IdnaMappingTable.txt).
// initIdnaStatusMap() associates each member with its keyword in that file.
// The first five members must keep the same order and values as IdnaStatus,
// because resolveIdnaStatus() converts between the two with a plain cast.
enum class IdnaRawStatus : unsigned int {
Disallowed,
Valid,
Ignored,
Mapped,
Deviation,
// Resolved to Disallowed by resolveIdnaStatus().
DisallowedStd3Valid,
// Resolved to Mapped or Disallowed by resolveIdnaStatus().
DisallowedStd3Mapped,
};
static QHash<QByteArray, IdnaRawStatus> idnaStatusMap;
static void initIdnaStatusMap()
{
struct {
IdnaRawStatus status;
const char *name;
} data[] = {
{IdnaRawStatus::Disallowed, "disallowed"},
{IdnaRawStatus::Valid, "valid"},
{IdnaRawStatus::Ignored, "ignored"},
{IdnaRawStatus::Mapped, "mapped"},
{IdnaRawStatus::Deviation, "deviation"},
{IdnaRawStatus::DisallowedStd3Valid, "disallowed_STD3_valid"},
{IdnaRawStatus::DisallowedStd3Mapped, "disallowed_STD3_mapped"},
};
for (const auto &entry : data)
idnaStatusMap[entry.name] = entry.status;
}
// Source text of the IdnaStatus enum declaration; main() writes it to the
// generated output right before the method declarations.
// Keep the member list in sync with the IdnaStatus enum defined in this file.
static const char *idna_status_string =
"enum class IdnaStatus : unsigned int {\n"
" Disallowed,\n"
" Valid,\n"
" Ignored,\n"
" Mapped,\n"
" Deviation\n"
"};\n\n";
// Resolved IDNA status as it goes into the database.
// Qt extends host name validity rules to allow underscores, so the two
// disallowed_STD3_* raw statuses are folded into the members below by
// resolveIdnaStatus().
// NOTE: The members here must come in the same order and have the same values
// as the corresponding members of IdnaRawStatus (resolveIdnaStatus() casts one
// to the other), and must match the declaration text in idna_status_string.
enum class IdnaStatus : unsigned int {
Disallowed,
Valid,
Ignored,
Mapped,
Deviation,
};
// Keep this one in sync with the code in createPropertyInfo // Keep this one in sync with the code in createPropertyInfo
static const char *property_string = static const char *property_string =
"enum Case {\n" "enum Case {\n"
@ -838,7 +891,8 @@ static const char *property_string =
" ushort graphemeBreakClass : 5; /* 5 used */\n" " ushort graphemeBreakClass : 5; /* 5 used */\n"
" ushort wordBreakClass : 5; /* 5 used */\n" " ushort wordBreakClass : 5; /* 5 used */\n"
" ushort lineBreakClass : 6; /* 6 used */\n" " ushort lineBreakClass : 6; /* 6 used */\n"
" ushort sentenceBreakClass : 8; /* 4 used */\n" " ushort sentenceBreakClass : 4; /* 4 used */\n"
" ushort idnaStatus : 4; /* 3 used */\n"
" ushort script : 8;\n" " ushort script : 8;\n"
"};\n\n" "};\n\n"
"Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n" "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n"
@ -861,6 +915,14 @@ static const char *methods =
"Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n" "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n"
"inline LineBreakClass lineBreakClass(QChar ch) noexcept\n" "inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
"{ return lineBreakClass(ch.unicode()); }\n" "{ return lineBreakClass(ch.unicode()); }\n"
"\n"
"Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept;\n"
"inline IdnaStatus idnaStatus(QChar ch) noexcept\n"
"{ return idnaStatus(ch.unicode()); }\n"
"\n"
"Q_CORE_EXPORT const char16_t * QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
"inline const char16_t *idnaMapping(QChar ch) noexcept\n"
"{ return idnaMapping(ch.unicode()); }\n"
"\n"; "\n";
static const int SizeOfPropertiesStruct = 20; static const int SizeOfPropertiesStruct = 20;
@ -899,6 +961,7 @@ struct PropertyFlags {
&& lineBreakClass == o.lineBreakClass && lineBreakClass == o.lineBreakClass
&& script == o.script && script == o.script
&& nfQuickCheck == o.nfQuickCheck && nfQuickCheck == o.nfQuickCheck
&& idnaStatus == o.idnaStatus
); );
} }
// from UnicodeData.txt // from UnicodeData.txt
@ -928,6 +991,7 @@ struct PropertyFlags {
int script = QChar::Script_Unknown; int script = QChar::Script_Unknown;
// from DerivedNormalizationProps.txt // from DerivedNormalizationProps.txt
uchar nfQuickCheck = 0; uchar nfQuickCheck = 0;
IdnaStatus idnaStatus = IdnaStatus::Disallowed;
}; };
@ -1082,6 +1146,8 @@ struct UnicodeData {
// computed position of unicode property set // computed position of unicode property set
int propertyIndex = -1; int propertyIndex = -1;
IdnaRawStatus idnaRawStatus = IdnaRawStatus::Disallowed;
}; };
static QList<UnicodeData> unicodeData; static QList<UnicodeData> unicodeData;
@ -2292,6 +2358,194 @@ static void readScripts()
} }
} }
static QMap<char32_t, QList<char32_t>> idnaMappingTable;
static void readIdnaMappingTable()
{
qDebug("Reading IdnaMappingTable.txt");
QFile f("data/IdnaMappingTable.txt");
if (!f.exists() || !f.open(QFile::ReadOnly))
qFatal("Couldn't find or read IdnaMappingTable.txt");
while (!f.atEnd()) {
QByteArray line = f.readLine().trimmed();
int comment = line.indexOf('#');
line = (comment < 0 ? line : line.left(comment)).simplified();
if (line.isEmpty())
continue;
QList<QByteArray> fields = line.split(';');
Q_ASSERT(fields.size() >= 2);
// That would be split(".."), but that API does not exist.
const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
QList<QByteArray> cl = codePoints.split('.');
Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
const QByteArray statusString = fields[1].trimmed();
if (!idnaStatusMap.contains(statusString))
qFatal("Unhandled IDNA status property value for %s: %s",
qPrintable(codePoints), qPrintable(statusString));
IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);
bool ok;
const int first = cl[0].toInt(&ok, 16);
const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
Q_ASSERT(ok);
QList<char32_t> mapping;
switch (rawStatus) {
case IdnaRawStatus::Disallowed:
case IdnaRawStatus::Valid:
case IdnaRawStatus::Ignored:
case IdnaRawStatus::DisallowedStd3Valid:
break;
case IdnaRawStatus::Mapped:
case IdnaRawStatus::Deviation:
case IdnaRawStatus::DisallowedStd3Mapped:
Q_ASSERT(fields.size() >= 3);
for (const auto &s : fields[2].trimmed().split(' ')) {
if (!s.isEmpty()) {
bool ok;
int val = s.toInt(&ok, 16);
Q_ASSERT_X(ok, "readIdnaMappingTable", qPrintable(line));
mapping.append(val);
}
}
// Some deviations have empty mappings, others should not...
if (mapping.isEmpty()) {
Q_ASSERT(rawStatus == IdnaRawStatus::Deviation);
qDebug() << " Empty IDNA mapping for" << codePoints;
}
break;
}
for (int codepoint = first; codepoint <= last; ++codepoint) {
UnicodeData &ud = UnicodeData::valueRef(codepoint);
// Ensure that ranges don't overlap.
Q_ASSERT(ud.idnaRawStatus == IdnaRawStatus::Disallowed);
ud.idnaRawStatus = rawStatus;
// ASCII codepoints are skipped here because they are processed in separate
// optimized code paths that do not use this mapping table.
if (codepoint >= 0x80 && !mapping.isEmpty())
idnaMappingTable[codepoint] = mapping;
}
}
}
/*
Resolve IDNA status by deciding whether to allow STD3 violations
Underscores are normally prohibited by STD3 rules but Qt allows underscores
to be used inside URLs (see QTBUG-7434 for example). This code changes the
underscore status to Valid. The same is done to mapped codepoints that
map to underscores combined with other Valid codepoints.
Underscores in domain names are required when using DNS-SD protocol and they
are also allowed by the SMB protocol.
*/
static void resolveIdnaStatus()
{
qDebug("resolveIdnaStatus:");
UnicodeData::valueRef(u'_').idnaRawStatus = IdnaRawStatus::Valid;
for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
UnicodeData &ud = UnicodeData::valueRef(codepoint);
switch (ud.idnaRawStatus) {
case IdnaRawStatus::Disallowed:
case IdnaRawStatus::Valid:
case IdnaRawStatus::Ignored:
case IdnaRawStatus::Deviation:
case IdnaRawStatus::Mapped:
ud.p.idnaStatus = static_cast<IdnaStatus>(ud.idnaRawStatus);
break;
case IdnaRawStatus::DisallowedStd3Valid:
ud.p.idnaStatus = IdnaStatus::Disallowed;
break;
case IdnaRawStatus::DisallowedStd3Mapped: {
Q_ASSERT(idnaMappingTable.contains(codepoint));
const auto &mapping = idnaMappingTable[codepoint];
bool allow = std::all_of(mapping.begin(), mapping.end(), [](auto c) {
return UnicodeData::valueRef(c).idnaRawStatus == IdnaRawStatus::Valid;
});
if (allow) {
qDebug() << " Allowing" << Qt::hex << codepoint;
ud.p.idnaStatus = IdnaStatus::Mapped;
} else {
ud.p.idnaStatus = IdnaStatus::Disallowed;
idnaMappingTable.remove(codepoint);
}
break;
}
}
}
}
/*
    Produces the source text for the IDNA mapping data.

    The returned text defines struct IdnaMapEntry, the array idnaMap with one
    entry per key of idnaMappingTable (in key order, each mapping encoded as
    UTF-16 and zero-padded to a common length), and the lookup function
    idnaMapping().
*/
static QByteArray createIdnaMapping()
{
    qDebug("createIdnaMapping:");

    // Length, in UTF-16 code units, of the longest mapping.
    size_t maxMappingLength = 0;
    for (const auto &mapping : idnaMappingTable) {
        size_t units = 0;
        for (char32_t c : mapping) {
            if (QChar::requiresSurrogates(c))
                units += 2;
            else
                units += 1;
        }
        if (units > maxMappingLength)
            maxMappingLength = units;
    }

    qDebug() << " max mapping length:" << maxMappingLength;

    // One extra array element is reserved for the terminating zero.
    QByteArray out;
    out += "struct IdnaMapEntry {\n"
           " char32_t codePoint;\n"
           " char16_t mapping[";
    out += QByteArray::number(maxMappingLength + 1);
    out += "];\n"
           "};\n\n"
           "static const IdnaMapEntry idnaMap[] = {\n";

    // Every entry is counted as 4 bytes of code point plus 2 bytes per array element.
    const size_t entryBytes = 4 + 2 * (maxMappingLength + 1);
    qsizetype memoryUsage = 0;

    for (auto it = idnaMappingTable.constBegin(); it != idnaMappingTable.constEnd(); ++it) {
        out += " { 0x";
        out += QByteArray::number(it.key(), 16);
        out += ", {";

        size_t written = 0;
        for (char32_t c : it.value()) {
            for (auto qc : QChar::fromUcs4(c)) {
                out += "0x";
                out += QByteArray::number(qc, 16);
                out += ", ";
                ++written;
            }
        }

        // Pad up to the common length; the final "0" is the terminator.
        while (written < maxMappingLength) {
            out += "0, ";
            ++written;
        }
        out += "0 }},\n";

        memoryUsage += entryBytes;
    }

    qDebug() << " memory usage:" << memoryUsage << "bytes";

    out +=
        "};\n\n"
        "Q_CORE_EXPORT const char16_t * QT_FASTCALL idnaMapping(char32_t ucs4) noexcept\n"
        "{\n"
        " auto i = std::lower_bound(std::begin(idnaMap), std::end(idnaMap), ucs4,\n"
        " [](const auto &p, char32_t c) { return p.codePoint < c; });\n"
        " if (i != std::end(idnaMap) && i->codePoint == ucs4)\n"
        " return i->mapping;\n"
        " return nullptr;\n"
        "}\n\n";

    return out;
}
#if 0 #if 0
static void dump(int from, int to) static void dump(int from, int to)
{ {
@ -2532,9 +2786,12 @@ static QByteArray createPropertyInfo()
out += ", "; out += ", ";
out += QByteArray::number( p.lineBreakClass ); out += QByteArray::number( p.lineBreakClass );
out += ", "; out += ", ";
// " ushort sentenceBreakClass : 8; /* 4 used */\n" // " ushort sentenceBreakClass : 4; /* 4 used */\n"
out += QByteArray::number( p.sentenceBreakClass ); out += QByteArray::number( p.sentenceBreakClass );
out += ", "; out += ", ";
// " ushort idnaStatus : 4; /* 3 used */\n"
out += QByteArray::number( static_cast<unsigned int>(p.idnaStatus) );
out += ", ";
// " ushort script : 8;\n" // " ushort script : 8;\n"
out += QByteArray::number( p.script ); out += QByteArray::number( p.script );
out += " },"; out += " },";
@ -2595,6 +2852,11 @@ static QByteArray createPropertyInfo()
"{\n" "{\n"
" return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n" " return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
"}\n" "}\n"
"\n"
"Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept\n"
"{\n"
" return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
"}\n"
"\n"; "\n";
return out; return out;
@ -3059,6 +3321,7 @@ int main(int, char **)
initSentenceBreak(); initSentenceBreak();
initLineBreak(); initLineBreak();
initScriptMap(); initScriptMap();
initIdnaStatusMap();
readUnicodeData(); readUnicodeData();
readBidiMirroring(); readBidiMirroring();
@ -3074,6 +3337,9 @@ int main(int, char **)
readWordBreak(); readWordBreak();
readSentenceBreak(); readSentenceBreak();
readLineBreak(); readLineBreak();
readIdnaMappingTable();
resolveIdnaStatus();
computeUniqueProperties(); computeUniqueProperties();
QByteArray properties = createPropertyInfo(); QByteArray properties = createPropertyInfo();
@ -3081,6 +3347,7 @@ int main(int, char **)
QByteArray compositions = createCompositionInfo(); QByteArray compositions = createCompositionInfo();
QByteArray ligatures = createLigatureInfo(); QByteArray ligatures = createLigatureInfo();
QByteArray normalizationCorrections = createNormalizationCorrections(); QByteArray normalizationCorrections = createNormalizationCorrections();
QByteArray idnaMapping = createIdnaMapping();
QByteArray header = QByteArray header =
"/****************************************************************************\n" "/****************************************************************************\n"
@ -3150,6 +3417,7 @@ int main(int, char **)
f.write(ligatures); f.write(ligatures);
f.write("\n"); f.write("\n");
f.write(normalizationCorrections); f.write(normalizationCorrections);
f.write(idnaMapping);
f.write("} // namespace QUnicodeTables\n\n"); f.write("} // namespace QUnicodeTables\n\n");
f.write("using namespace QUnicodeTables;\n\n"); f.write("using namespace QUnicodeTables;\n\n");
f.write("QT_END_NAMESPACE\n"); f.write("QT_END_NAMESPACE\n");
@ -3173,6 +3441,7 @@ int main(int, char **)
f.write(word_break_class_string); f.write(word_break_class_string);
f.write(sentence_break_class_string); f.write(sentence_break_class_string);
f.write(line_break_class_string); f.write(line_break_class_string);
f.write(idna_status_string);
f.write(methods); f.write(methods);
f.write("} // namespace QUnicodeTables\n\n" f.write("} // namespace QUnicodeTables\n\n"
"QT_END_NAMESPACE\n\n" "QT_END_NAMESPACE\n\n"