Rework QLocale's likely sub-tag addition

Instead of looking up each candidate pattern in a separate O(log(n))
search, exploit the fact that the array is in the right order to put
each candidate we try after the ones we'd have preferred over it.

At the same time, add und_script_region and und_region searches, which
aren't mentioned in the spec's algorithm but are clearly meant to be
searched (the spec's examples include some). Also, document what's
going on - because it's a bit complicated!

Change-Id: Id88ced335b0d2dfd18fb59c9a3dc75571f2a44ef
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
This commit is contained in:
Edward Welbourne 2020-10-12 15:02:50 +02:00
parent a9e4bf7eef
commit 8693d473e5

View File

@ -201,7 +201,14 @@ QLatin1String QLocalePrivate::countryToCode(QLocale::Country country)
return QLatin1String(reinterpret_cast<const char*>(c), c[2] == 0 ? 2 : 3); return QLatin1String(reinterpret_cast<const char*>(c), c[2] == 0 ? 2 : 3);
} }
static int cmpLikelySubtag(const void *lhs, const void *rhs) namespace {
// Views two consecutive likely_subtags entries as one (key, value) pair.
// withLikelySubtagsAdded() reinterpret_casts the QLocaleId array to this
// type, so it must remain exactly two QLocaleId members, in this order.
struct LikelyPair
{
QLocaleId key; // Search key; the only member operator< compares.
QLocaleId value = QLocaleId { 0, 0, 0 }; // Replacement for key; all-zero unless set.
};
bool operator<(const LikelyPair &lhs, const LikelyPair &rhs)
{ {
// Must match the comparison LocaleDataWriter.likelySubtags() uses when // Must match the comparison LocaleDataWriter.likelySubtags() uses when
// sorting, see qtbase/util/locale_database.qlocalexml2cpp.py // sorting, see qtbase/util/locale_database.qlocalexml2cpp.py
@ -210,73 +217,119 @@ static int cmpLikelySubtag(const void *lhs, const void *rhs)
const int huge = 0x10000; const int huge = 0x10000;
return (lhs ? lhs : huge) - (rhs ? rhs : huge); return (lhs ? lhs : huge) - (rhs ? rhs : huge);
}; };
const auto &left = *reinterpret_cast<const QLocaleId *>(lhs); const auto &left = lhs.key;
const auto &right = *reinterpret_cast<const QLocaleId *>(rhs); const auto &right = rhs.key;
// Comparison order: language, region, script:
if (int cmp = compare(left.language_id, right.language_id)) if (int cmp = compare(left.language_id, right.language_id))
return cmp; return cmp < 0;
if (int cmp = compare(left.country_id, right.country_id)) if (int cmp = compare(left.country_id, right.country_id))
return cmp; return cmp < 0;
return compare(left.script_id, right.script_id); return compare(left.script_id, right.script_id) < 0;
} }
} // anonymous namespace
// http://www.unicode.org/reports/tr35/#Likely_Subtags /*!
static bool addLikelySubtags(QLocaleId &localeId) Fill in blank fields of a locale ID.
{
// Array is overtly of QLocaleId but to be interpreted as of pairs, mapping
// each even entry to the following odd entry. So search only the even
// entries for a match and return the matching odd entry, if found.
static_assert(std::size(likely_subtags) % 2 == 0);
const auto *p = reinterpret_cast<const QLocaleId *>(
bsearch(&localeId,
likely_subtags, std::size(likely_subtags) / 2, 2 * sizeof(QLocaleId),
cmpLikelySubtag));
if (!p)
return false;
Q_ASSERT(p >= likely_subtags && p < likely_subtags + std::size(likely_subtags));
Q_ASSERT((p - likely_subtags) % 2 == 0);
localeId = p[1];
return true;
}
An ID in which some fields are zero stands for any locale that agrees with
it in its non-zero fields. CLDR's likely-subtag data is meant to help us
choose which candidate to prefer. (Note, however, that CLDR does have some
cases where it maps an ID to a "best match" for which CLDR does not provide
data, even though there are locales for which CLDR does provide data that do
match the given ID. It's telling us, unhelpfully but truthfully, what
locale would (most likely) be meant by (someone using) the combination
requested, even when that locale isn't yet supported.) It may also map an
obsolete or generic tag to a modern or more specific replacement, possibly
filling in some of the other fields in the process (presently only for
countries). Note that some fields of the result may remain blank, but there
is no more specific recommendation available.
For the formal specification, see
http://www.unicode.org/reports/tr35/#Likely_Subtags
\note We also search und_script_region and und_region; they're not mentioned
in the spec, but the examples clearly presume them and CLDR does provide
such likely matches.
*/
QLocaleId QLocaleId::withLikelySubtagsAdded() const QLocaleId QLocaleId::withLikelySubtagsAdded() const
{ {
// language_script_region /* Each pattern that appears in the comments below, language_script_region and
if (language_id || script_id || country_id) { similar, indicates which of this's fields (even if blank) are being
QLocaleId id { language_id, script_id, country_id }; attended to in a given search; for fields left out of the pattern, the
if (addLikelySubtags(id)) search uses 0 regardless of whether this has specified the field.
return id;
} If a key matches what we're searching for (possibly with a wildcard in
// language_region the key matching a non-wildcard in our search), the tags from this that
if (script_id) { are specified in the key are replaced by the match (even if different);
QLocaleId id { language_id, 0, country_id }; but the other tags of this replace what's in the match (even when the
if (addLikelySubtags(id)) { match does specify a value).
id.script_id = script_id; */
return id; static_assert(std::size(likely_subtags) % 2 == 0);
} auto *pairs = reinterpret_cast<const LikelyPair *>(likely_subtags);
} auto *const afterPairs = pairs + std::size(likely_subtags) / 2;
// language_script LikelyPair sought { *this };
if (country_id) { // Our array is sorted in the order that puts all candidate matches in the
QLocaleId id { language_id, script_id, 0 }; // order we would want them; ones we should prefer appear before the others.
if (addLikelySubtags(id)) {
id.country_id = country_id;
return id;
}
}
// language
if (script_id && country_id) {
QLocaleId id { language_id, 0, 0 };
if (addLikelySubtags(id)) {
id.script_id = script_id;
id.country_id = country_id;
return id;
}
}
// und_script
if (language_id) { if (language_id) {
QLocaleId id { 0, script_id, 0 }; // language_script_region, language_region, language_script, language:
if (addLikelySubtags(id)) { pairs = std::lower_bound(pairs, afterPairs, sought);
id.language_id = language_id; // Single language's block isn't long enough to warrant more binary
return id; // chopping within it - just traverse it all:
for (; pairs < afterPairs && pairs->key.language_id == language_id; ++pairs) {
const QLocaleId &key = pairs->key;
if (key.country_id && key.country_id != country_id)
continue;
if (key.script_id && key.script_id != script_id)
continue;
QLocaleId value = pairs->value;
if (country_id && !key.country_id)
value.country_id = country_id;
if (script_id && !key.script_id)
value.script_id = script_id;
return value;
}
}
// und_script_region or und_region (in that order):
if (country_id) {
sought.key = QLocaleId { 0, script_id, country_id };
pairs = std::lower_bound(pairs, afterPairs, sought);
// Again, individual und_?_region block isn't long enough to make binary
// chop a win:
for (; pairs < afterPairs && pairs->key.country_id == country_id; ++pairs) {
const QLocaleId &key = pairs->key;
Q_ASSERT(!key.language_id);
if (key.script_id && key.script_id != script_id)
continue;
QLocaleId value = pairs->value;
if (language_id)
value.language_id = language_id;
if (script_id && !key.script_id)
value.script_id = script_id;
return value;
}
}
// und_script:
if (script_id) {
sought.key = QLocaleId { 0, script_id, 0 };
pairs = std::lower_bound(pairs, afterPairs, sought);
if (pairs < afterPairs && pairs->key.script_id == script_id) {
Q_ASSERT(!pairs->key.language_id && !pairs->key.country_id);
QLocaleId value = pairs->value;
if (language_id)
value.language_id = language_id;
if (country_id)
value.country_id = country_id;
return value;
}
}
if (matchesAll()) { // Skipped all of the above.
// CLDR has no match-all at v37, but might get one some day ...
pairs = std::lower_bound(pairs, afterPairs, sought);
if (pairs < afterPairs) {
// All other keys are < match-all.
Q_ASSERT(pairs + 1 == afterPairs);
Q_ASSERT(pairs->key.matchesAll());
return pairs->value;
} }
} }
return *this; return *this;