From cf48fc4bc87a62b61a6b9207dce792cd83b6ff65 Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Fri, 16 Aug 2024 11:26:59 +0200 Subject: [PATCH] Prune timezone L10n data in preparation for writing it to C++ files The data is very big but much of it is inherited by zones from those that they map to via likely-subtag reduction, so omit the data where it coincides with the result of such an inheritance; this shall complicate the reading of the data, but saves dramatically on its size, reducing it to "only" c. 2 MiB. Task-number: QTBUG-115158 Change-Id: I53ff13e29f1f73a551d73d75773373bb90673c8e Reviewed-by: Mate Barany --- util/locale_database/qlocalexml.py | 136 +++++++++++++++++++++++++ util/locale_database/qlocalexml2cpp.py | 2 + 2 files changed, 138 insertions(+) diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py index 3fdad9ed0f9..0f85e8e16de 100644 --- a/util/locale_database/qlocalexml.py +++ b/util/locale_database/qlocalexml.py @@ -115,6 +115,68 @@ class QLocaleXmlReader (object): yield (language, script, territory), locale + def pruneZoneNaming(self, locmap, report=lambda *x: None): + """Deduplicate zoneNaming and metaNaming mappings. + + Where one locale would fall back to another via likely subtag + fallbacks, skip any entries in the former's zoneNaming and metaNaming + where it agrees with the latter. + + This prunes over half of the (locale, zone) table and nearly two + thirds of the (locale, meta) table.""" + likely = tuple((has, got) for have, has, give, got in self.likelyMap()) + def fallbacks(key): + # Should match QtTimeZoneLocale::fallbackLocalesFor() in qlocale.cpp + tried, head = { key }, 2 + while head > 0: + # Retain [:head] of key but use 0 (i.e. 
Any) for the rest: + it = self.__fillLikely(key[:head] + (0,) * (3 - head), likely) + if it not in tried: + tried.add(it) + if it in locmap: + yield locmap[it] + head -= 1 + + # TODO: fix case of later fallbacks lacking a short name for a + # metazone, where earlier ones with a short name all agree. Maybe do + # similar for long names, and for zones as well as meta. + # For a metazone, the territories in its map to IANA ID, combined with + # the language and script of a locale that lacks names, give locales to + # consult that might fall back to it, and to which to pay particular + # attention. + + zonePrior = metaPrior = 0 + zoneCount = metaCount = locCount = 0 + for key, loc in locmap.items(): + zonePrior += len(loc.zoneNaming) + metaPrior += len(loc.metaNaming) + # Omit zoneNaming and metaNaming entries that match those + # of their likely sub-tag fallbacks. + filtered = False + for alt in fallbacks(key): + filtered = True + # Collect keys to purge before purging, so as not to + # modify mappings while iterating them. 
+ purge = [zone for zone, data in loc.zoneNaming.items() + if (zone in alt.zoneNaming + and data == alt.zoneNaming[zone])] + zoneCount += len(purge) + for zone in purge: + del loc.zoneNaming[zone] + + purge = [meta for meta, data in loc.metaNaming.items() + if (meta in alt.metaNaming + and data == alt.metaNaming[meta])] + metaCount += len(purge) + for meta in purge: + del loc.metaNaming[meta] + if filtered: + locCount += 1 + + report(f'Pruned duplicates: {zoneCount} (of {zonePrior}) zone ' + f'and {metaCount} (of {metaPrior}) metazone ' + f'entries from {locCount} (of {len(locmap)}) locales.\n') + def aliasToIana(self): def attr(elt, key): return elt.attributes[key].nodeValue @@ -282,6 +344,80 @@ class QLocaleXmlReader (object): # Use language, territory, script for sort order: return have[0], have[2], have[1] + @classmethod + def __lowerLikely(cls, key, likely): + """Lower-bound index for key in the likely subtag table + + Equivalent to the std::lower_bound() calls in + QLocaleId::withLikelySubtagsAdded().""" + lo, hi = 0, len(likely) + key = cls.__keyLikely(key) + while lo + 1 < hi: + mid, rem = divmod(lo + hi, 2) + has = cls.__keyLikely(likely[mid][0]) + if has < key: + lo = mid + elif has > key: + hi = mid + else: + return mid + return hi + + @classmethod + def __fillLikely(cls, key, likely): + """Equivalent to QLocaleId::withLikelySubtagsAdded() + + Takes one (language, script, territory) triple, key, of QLocale enum + numeric values and returns another that fills in any zero entries based + on the likely subtag data supplied as likely.""" + lang, script, land = key + if lang and likely: + likely = likely[cls.__lowerLikely(key, likely):] + for entry in likely: + vox, txt, ter = entry[0] + if vox != lang: + break + if land and ter != land: + continue + if script and txt != script: + continue + + vox, txt, ter = entry[1] + return vox, txt or script, ter or land + + if land and likely: + likely = likely[cls.__lowerLikely((0, script, land), likely):] + for entry 
in likely: + vox, txt, ter = entry[0] + assert not vox, (key, entry) + if ter != land: + break + if txt != script: + continue + + vox, txt, ter = entry[1] + return lang or vox, txt or script, ter + + if script and likely: + likely = likely[cls.__lowerLikely((0, script, 0), likely):] + for entry in likely: + vox, txt, ter = entry[0] + assert not (vox or ter), (key, entry) + if txt != script: + break + + vox, txt, ter = entry[1] + return lang or vox, txt, land or ter + + if not any(key) and likely: + likely = likely[cls.__lowerLikely(key, likely):] + if likely: + assert len(likely) == 1 + assert likely[0][0] == key + return likely[0][1] + + return key + # DOM access: from xml.dom import minidom @staticmethod diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index e17e52f8f66..2de74c83378 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -740,6 +740,7 @@ def main(argv, out, err): parser.add_argument('-q', '--quiet', help='less output', dest='verbose', action='store_const', const=-1) args = parser.parse_args(argv[1:]) + mutter = (lambda *x: None) if args.verbose < 0 else out.write qlocalexml = args.input_file qtsrcdir = Path(args.qtbase_path) @@ -752,6 +753,7 @@ def main(argv, out, err): reader = QLocaleXmlReader(qlocalexml) locale_map = dict(reader.loadLocaleMap(calendars, err.write)) + reader.pruneZoneNaming(locale_map, mutter) locale_keys = sorted(locale_map.keys(), key=LocaleKeySorter(reader.defaultMap())) code_data = LanguageCodeData(args.iso_path)