Rework QLocale's likely sub-tag addition

Instead of looking up each candidate pattern in a separate O(log(n)) search, exploit the fact that the array is in the right order to put each candidate we try after the ones we'd have preferred over it. At the same time, add und_script_region and und_region searches, which aren't mentioned in the spec's algorithm but are clearly meant to be searched (the spec's examples include some). Also, document what's going on - because it's a bit complicated! Change-Id: Id88ced335b0d2dfd18fb59c9a3dc75571f2a44ef Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2020-10-12 15:02:50 +02:00 · 2020-10-12 15:02:50 +02:00 · 8693d473e5
commit 8693d473e5
parent a9e4bf7eef
1 changed files with 113 additions and 60 deletions
--- a/src/corelib/text/qlocale.cpp
+++ b/src/corelib/text/qlocale.cpp
@ -201,7 +201,14 @@ QLatin1String QLocalePrivate::countryToCode(QLocale::Country country)
    return QLatin1String(reinterpret_cast<const char*>(c), c[2] == 0 ? 2 : 3);
 }
-static int cmpLikelySubtag(const void *lhs, const void *rhs)
+namespace {
 // One entry of the likely_subtags table viewed as a (key, value) pair.  The
 // table's flat QLocaleId array is reinterpreted as an array of these, so this
 // struct must remain exactly two consecutive QLocaleId members.
 struct LikelyPair
 {
    QLocaleId key; // Search key.
    // Replacement associated with key; all-zero by default so that a pair built
    // from a key alone (as when constructing a search target) is initialized.
    QLocaleId value = QLocaleId { 0, 0, 0 };
 };
 bool operator<(const LikelyPair &lhs, const LikelyPair &rhs)
 {
    // Must match the comparison LocaleDataWriter.likelySubtags() uses when
    // sorting, see qtbase/util/locale_database/qlocalexml2cpp.py
@ -210,73 +217,119 @@ static int cmpLikelySubtag(const void *lhs, const void *rhs)
        const int huge = 0x10000;
        return (lhs ? lhs : huge) - (rhs ? rhs : huge);
    };
-    const auto &left = *reinterpret_cast<const QLocaleId *>(lhs);
+    const auto &left = lhs.key;
-    const auto &right = *reinterpret_cast<const QLocaleId *>(rhs);
+    const auto &right = rhs.key;
    // Comparison order: language, region, script:
    if (int cmp = compare(left.language_id, right.language_id))
-        return cmp;
+        return cmp < 0;
    if (int cmp = compare(left.country_id, right.country_id))
-        return cmp;
+        return cmp < 0;
-    return compare(left.script_id, right.script_id);
+    return compare(left.script_id, right.script_id) < 0;
 }
 } // anonymous namespace
-// http://www.unicode.org/reports/tr35/#Likely_Subtags
+/*!
-static bool addLikelySubtags(QLocaleId &localeId)
+    Fill in blank fields of a locale ID.
 {
    // Array is overtly of QLocaleId but to be interpreted as of pairs, mapping
    // each even entry to the following odd entry.  So search only the even
    // entries for a match and return the matching odd entry, if found.
    static_assert(std::size(likely_subtags) % 2 == 0);
    const auto *p = reinterpret_cast<const QLocaleId *>(
        bsearch(&localeId,
                likely_subtags, std::size(likely_subtags) / 2, 2 * sizeof(QLocaleId),
                cmpLikelySubtag));
    if (!p)
        return false;
    Q_ASSERT(p >= likely_subtags && p < likely_subtags + std::size(likely_subtags));
    Q_ASSERT((p - likely_subtags) % 2 == 0);
    localeId = p[1];
    return true;
 }
    An ID in which some fields are zero stands for any locale that agrees with
    it in its non-zero fields.  CLDR's likely-subtag data is meant to help us
    choose which candidate to prefer.  (Note, however, that CLDR does have some
    cases where it maps an ID to a "best match" for which CLDR does not provide
    data, even though there are locales for which CLDR does provide data that do
    match the given ID.  It's telling us, unhelpfully but truthfully, what
    locale would (most likely) be meant by (someone using) the combination
    requested, even when that locale isn't yet supported.)  It may also map an
    obsolete or generic tag to a modern or more specific replacement, possibly
    filling in some of the other fields in the process (presently only for
    countries).  Note that some fields of the result may remain blank, but there
    is no more specific recommendation available.
    For the formal specification, see
    http://www.unicode.org/reports/tr35/#Likely_Subtags
    \note We also search und_script_region and und_region; they're not mentioned
    in the spec, but the examples clearly presume them and CLDR does provide
    such likely matches.
 */
 QLocaleId QLocaleId::withLikelySubtagsAdded() const
 {
-    // language_script_region
+    /* Each pattern that appears in a comment below, language_script_region and
-    if (language_id || script_id || country_id) {
+       similar, indicates which of this's fields (even if blank) are being
-        QLocaleId id { language_id, script_id, country_id };
+       attended to in a given search; for fields left out of the pattern, the
-        if (addLikelySubtags(id))
+       search uses 0 regardless of whether this has specified the field.
-            return id;
+
-    }
+       If a key matches what we're searching for (possibly with a wildcard in
-    // language_region
+       the key matching a non-wildcard in our search), the tags from this that
-    if (script_id) {
+       are specified in the key are replaced by the match (even if different);
-        QLocaleId id { language_id, 0, country_id };
+       but the other tags of this replace what's in the match (even when the
-        if (addLikelySubtags(id)) {
+       match does specify a value).
-            id.script_id = script_id;
+    */
-            return id;
+    static_assert(std::size(likely_subtags) % 2 == 0);
-        }
+    auto *pairs = reinterpret_cast<const LikelyPair *>(likely_subtags);
-    }
+    auto *const afterPairs = pairs + std::size(likely_subtags) / 2;
-    // language_script
+    LikelyPair sought { *this };
-    if (country_id) {
+    // Our array is sorted in the order that puts all candidate matches in the
-        QLocaleId id { language_id, script_id, 0 };
+    // order we would want them; ones we should prefer appear before the others.
        if (addLikelySubtags(id)) {
            id.country_id = country_id;
            return id;
        }
    }
    // language
    if (script_id && country_id) {
        QLocaleId id { language_id, 0, 0 };
        if (addLikelySubtags(id)) {
            id.script_id = script_id;
            id.country_id = country_id;
            return id;
        }
    }
    // und_script
    if (language_id) {
-        QLocaleId id { 0, script_id, 0 };
+        // language_script_region, language_region, language_script, language:
-        if (addLikelySubtags(id)) {
+        pairs = std::lower_bound(pairs, afterPairs, sought);
-            id.language_id = language_id;
+        // Single language's block isn't long enough to warrant more binary
-            return id;
+        // chopping within it - just traverse it all:
        for (; pairs < afterPairs && pairs->key.language_id == language_id; ++pairs) {
            const QLocaleId &key = pairs->key;
            if (key.country_id && key.country_id != country_id)
                continue;
            if (key.script_id && key.script_id != script_id)
                continue;
            QLocaleId value = pairs->value;
            if (country_id && !key.country_id)
                value.country_id = country_id;
            if (script_id && !key.script_id)
                value.script_id = script_id;
            return value;
        }
    }
    // und_script_region or und_region (in that order):
    if (country_id) {
        sought.key = QLocaleId { 0, script_id, country_id };
        pairs = std::lower_bound(pairs, afterPairs, sought);
        // Again, individual und_?_region block isn't long enough to make binary
        // chop a win:
        for (; pairs < afterPairs && pairs->key.country_id == country_id; ++pairs) {
            const QLocaleId &key = pairs->key;
            Q_ASSERT(!key.language_id);
            if (key.script_id && key.script_id != script_id)
                continue;
            QLocaleId value = pairs->value;
            if (language_id)
                value.language_id = language_id;
            if (script_id && !key.script_id)
                value.script_id = script_id;
            return value;
        }
    }
    // und_script:
    if (script_id) {
        sought.key = QLocaleId { 0, script_id, 0 };
        pairs = std::lower_bound(pairs, afterPairs, sought);
        if (pairs < afterPairs && pairs->key.script_id == script_id) {
            Q_ASSERT(!pairs->key.language_id && !pairs->key.country_id);
            QLocaleId value = pairs->value;
            if (language_id)
                value.language_id = language_id;
            if (country_id)
                value.country_id = country_id;
            return value;
        }
    }
    if (matchesAll()) { // Skipped all of the above.
        // CLDR has no match-all at v37, but might get one some day ...
        pairs = std::lower_bound(pairs, afterPairs, sought);
        if (pairs < afterPairs) {
            // All other keys are < match-all.
            Q_ASSERT(pairs + 1 == afterPairs);
            Q_ASSERT(pairs->key.matchesAll());
            return pairs->value;
        }
    }
    return *this;