Add type annotations to CldrAccess

Task-number: QTBUG-129613
Pick-to: 6.8
Change-Id: I8a00cca718554909b7ab9dcad15cc9b9ac702e94
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
2024-10-10 17:02:56 +02:00 · 2024-10-10 17:02:56 +02:00 · defd1549de
commit defd1549de
parent adc4ec9d39
2 changed files with 93 additions and 73 deletions
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@ -10,11 +10,11 @@ The former should normally be all you need to access.
 See individual classes for further detail.
 """
-from typing import Iterable, TextIO
+from typing import Callable, Iterable, Iterator, TextIO
 from xml.dom import minidom
 from weakref import WeakValueDictionary as CacheDict
 from pathlib import Path
-from datetime import datetime
+from datetime import datetime, timedelta
 from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
 from localetools import names_clash
@ -309,7 +309,7 @@ class CldrReader (object):
 # the cache. If a process were to instantiate this class with distinct
 # roots, each cache would be filled by the first to need it !
 class CldrAccess (object):
-    def __init__(self, root: Path):
+    def __init__(self, root: Path) -> None:
        """Set up a master object for accessing CLDR data.
        Single parameter, root, is the file-system path to the root of
@ -317,20 +317,20 @@ class CldrAccess (object):
        contain dtd/, main/ and supplemental/ sub-directories."""
        self.root = root
-    def xml(self, relative_path: str):
+    def xml(self, relative_path: str) -> XmlScanner:
        """Load a single XML file and return its root element as an XmlScanner.
        The path is interpreted relative to self.root"""
        return XmlScanner(Node(self.__xml(relative_path)))
-    def supplement(self, name):
+    def supplement(self, name: str) -> Supplement:
        """Loads supplemental data as a Supplement object.
        The name should be that of a file in common/supplemental/, without path.
        """
        return Supplement(Node(self.__xml(f'common/supplemental/{name}')))
-    def locale(self, name):
+    def locale(self, name: str) -> LocaleScanner:
        """Loads all data for a locale as a LocaleScanner object.
        The name should be a locale name; adding suffix '.xml' to it
@ -340,7 +340,7 @@ class CldrAccess (object):
        inheritance, where relevant."""
        return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
-    def englishNaming(self, tag): # see QLocaleXmlWriter.enumData()
+    def englishNaming(self, tag: str) -> Callable[[str], str]: # see QLocaleXmlWriter.enumData()
        return self.__codeMap(tag).get
    @property
@ -354,18 +354,18 @@ class CldrAccess (object):
                yield path.stem
    @property
-    def defaultContentLocales(self):
+    def defaultContentLocales(self) -> Iterator[str]:
        """Generator for the default content locales."""
        for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'):
            try:
-                locales = attrs['locales']
+                locales: str = attrs['locales']
            except KeyError:
                pass
            else:
                for locale in locales.split():
                    yield locale
-    def likelySubTags(self):
+    def likelySubTags(self) -> Iterator[tuple[str, str]]:
        for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'):
            yield attrs['from'], attrs['to']
@ -380,7 +380,7 @@ class CldrAccess (object):
        except KeyError:
            raise Error(f'Unsupported number system: {system}')
-    def weekData(self, territory):
+    def weekData(self, territory: str) -> tuple[str, str, str]:
        """Data on the weekly cycle.
        Returns a triple (W, S, E) of en's short names for week-days;
@ -393,7 +393,7 @@ class CldrAccess (object):
        except KeyError:
            return self.__weekData['001']
-    def currencyData(self, territory):
+    def currencyData(self, territory: str) -> tuple[str, int, int]:
        """Returns currency data for the given territory code.
        Return value is a tuple (ISO4217 code, digit count, rounding
@ -405,7 +405,9 @@ class CldrAccess (object):
        except KeyError:
            return '', 2, 1
-    def codesToIdName(self, language, script, territory, variant = ''):
+    def codesToIdName(self, language: str, script: str, territory: str, variant: str = ''
+                     ) -> tuple[tuple[int, str], tuple[int, str],
+                                tuple[int, str], tuple[int, str]]:
        """Maps each code to the appropriate ID and name.
        Returns a 4-tuple of (ID, name) pairs corresponding to the
@ -417,7 +419,7 @@ class CldrAccess (object):
        Until we implement variant support (QTBUG-81051), the fourth
        member of the returned tuple is always 0 paired with a string
        that should not be used."""
-        enum = self.__enumMap
+        enum: Callable[[str], dict[str, tuple[int, str]]] = self.__enumMap
        try:
            return (enum('language')[language],
                    enum('script')[script],
@ -428,8 +430,9 @@ class CldrAccess (object):
        parts, values = [], [language, script, territory, variant]
        for index, key in enumerate(('language', 'script', 'territory', 'variant')):
-            naming, enums = self.__codeMap(key), enum(key)
+            naming: dict[str, str] = self.__codeMap(key)
-            value = values[index]
+            enums: dict[str, tuple[int, str]]  = enum(key)
+            value: str = values[index]
            if value not in enums:
                text = f'{key} code {value}'
                name = naming.get(value)
@ -447,21 +450,22 @@ class CldrAccess (object):
                    language, script, territory, variant)
    @staticmethod
-    def __checkEnum(given, proper, scraps):
+    def __checkEnum(given: dict[str, str], proper: dict[str, str], scraps: set[str]
+                    ) -> Iterator[tuple[str, str]]:
        # Each is a { code: full name } mapping
        for code, name in given.items():
-            try: right = proper[code]
+            try: right: str = proper[code]
            except KeyError:
                # No en.xml name for this code, but supplementalData's
                # parentLocale may still believe in it:
                if code not in scraps:
                    yield name, f'[Found no CLDR name for code {code}]'
                continue
-            cleaned = names_clash(right, name)
+            cleaned: None | str = names_clash(right, name)
            if cleaned:
                yield name, cleaned
-    def checkEnumData(self, grumble):
+    def checkEnumData(self, grumble: Callable[[str], int]) -> None:
        scraps = set()
        for k in self.__parentLocale.keys():
            for f in k.split('_'):
@ -492,7 +496,7 @@ enumdata.py (keeping the old name as an alias):
                        + '\n')
            grumble('\n')
-    def bcp47Aliases(self):
+    def bcp47Aliases(self) -> tuple[dict[str, str], dict[str, str]]:
        """Reads the mapping from CLDR IDs to IANA IDs
        CLDR identifies timezones in various ways but its standard
@ -530,7 +534,8 @@ enumdata.py (keeping the old name as an alias):
        # If we ever need a mapping back to CLDR ID, we can make
        # (description, space-joined-list) the naming values.
-        alias, naming = {}, {} # { alias: iana }, { iana: description }
+        alias: dict[str, str] = {} # { alias: iana }
+        naming: dict[str, str] = {} # { iana: description }
        for item, attrs in root.find('keyword/key/type', exclude=('deprecated',)):
            assert 'description' in attrs, item
            assert 'alias' in attrs, item
@ -545,7 +550,8 @@ enumdata.py (keeping the old name as an alias):
        return alias, naming
-    def readWindowsTimeZones(self, alias):
+    def readWindowsTimeZones(self, alias: dict[str, str]) -> tuple[dict[str, str],
+                                                                   list[tuple[str, str, str]]]:
        """Digest CLDR's MS-Win time-zone name mapping.
        Single argument, alias, should be the first part of the pair
@ -582,7 +588,8 @@ enumdata.py (keeping the old name as an alias):
        mapZone element and the last is s, its cleaned-up list of IANA
        IDs."""
-        defaults, windows = {}, []
+        defaults: dict[str, str] = {}
+        windows: list[tuple[str, str, str]] = []
        zones = self.supplement('windowsZones.xml')
        for name, attrs in zones.find('windowsZones/mapTimezones'):
            if name != 'mapZone':
@ -602,7 +609,10 @@ enumdata.py (keeping the old name as an alias):
        return defaults, windows
-    def readMetaZoneMap(self, alias):
+    def readMetaZoneMap(self, alias: dict[str, str]
+                        ) -> tuple[dict[str, dict[str, str]],
+                                   dict[str, tuple[tuple[int, int, str], ...]],
+                                   dict[str, str]]:
        """Digests the metaZones supplemental data.
        Required argument, alias, should be the first of
@ -633,9 +643,9 @@ enumdata.py (keeping the old name as an alias):
        locale."""
        metaZones = self.supplement('metaZones.xml') # Doesn't appear to use draft attribute
        # Map CLDR name to IANA name (or use CLDR name if unknown to alias):
-        zoneName = lambda n, g=alias.get: g(n, n)
+        zoneName: Callable[[str], str] = lambda n, g=alias.get: g(n, n)
-        metaMap = {} # { meta: { territory code: zoneId } }
+        metaMap: dict[str, dict[str, str]] = {} # { meta: { territory code: zoneId } }
        # Entry with territory 001 is "golden zone" for the metazone.
        for mapMeta in metaZones.findNodes('metaZones/mapTimezones'):
            attrs = mapMeta.attributes()
@ -646,13 +656,13 @@ enumdata.py (keeping the old name as an alias):
                raise Error('Version of metazone map type is not 2018e', attrs)
            for node in mapMeta.findAllChildren('mapZone'):
-                attrs = node.attributes()
+                attrs: dict[str, str] = node.attributes()
                try:
                    meta, code, zone = attrs['other'], attrs['territory'], attrs['type']
                except KeyError:
                    continue
-                bok = metaMap.setdefault(meta, {})
+                bok: dict[str, str] = metaMap.setdefault(meta, {})
                assert code not in bok, (meta, code)
                bok[code] = zoneName(zone)
        # Territories not named in a metaMap entry fall back on the
@ -660,16 +670,16 @@ enumdata.py (keeping the old name as an alias):
        # entry:
        assert all('001' in bok for bok in metaMap.values())
-        def scanUses(zone, check=metaMap):
+        def scanUses(zone: Node, check=metaMap) -> Iterator[tuple[str|None, str|None, str]]:
            for node in zone.findAllChildren('usesMetazone'):
-                attrs = node.attributes()
+                attrs: dict[str, str] = node.attributes()
-                mzone = attrs['mzone']
+                mzone: str = attrs['mzone']
                if mzone not in check:
                    raise Error('Unknown metazone', mzone)
                # These are UTC date-times.
                yield attrs.get('from'), attrs.get('to'), mzone
-        def sortKey(triple):
+        def sortKey(triple: tuple[str|None, str|None, str]) -> str | None:
            start, stop, mzone = triple
            # The start = None entry should sort first; since its key
            # is its stop, which is likely the next entry's start, we
@ -680,11 +690,11 @@ enumdata.py (keeping the old name as an alias):
            # in the list, so the sorting is fatuous and the key
            # doesn't matter).
-        def timeRep(text, notime, epoch=datetime(1970, 1, 1, 0, 0)):
+        def timeRep(text: str, notime: bool, epoch=datetime(1970, 1, 1, 0, 0)) -> int:
            """Map a 'yyyy-MM-dd HH:mm' string to epoch minutes.
            If the HH:mm part is omitted, second parameter notime is true to
-            use the end of the day, false for the start. LDM specifies this
+            use the end of the day, false for the start. LDML specifies this
            reading of the pure-date values for start and stop attributes.  If
            the HH:mm part is 24:00, the end of the day is also used; LDML
            specifies this but python's datetime.fromisoformat() doesn't like
@ -704,16 +714,20 @@ enumdata.py (keeping the old name as an alias):
                    assert len(text) == 16, text
                # If it's given with HH:mm as 24:00, this throws:
-                diff = datetime.fromisoformat(text) - epoch
+                diff: timedelta = datetime.fromisoformat(text) - epoch
            except ValueError:
                diff = datetime.fromisoformat(text[:10]) - epoch
                diff += diff.__class__(days=1)
            assert diff.days >= 0 and diff.seconds >= 0, (diff, text)
-            assert diff.seconds % 60 == 0, (diff, text)
+            mins, secs = divmod(diff.seconds, 60)
-            return diff.days * 1440 + int(diff.seconds / 60)
+            assert secs == 0, (diff, text)
+            return diff.days * 1440 + mins
-        def mapTimes(triple, alpha=0, omega=(1<<32)-1, torep=timeRep):
+        def mapTimes(triple: tuple[str|None, str|None, str],
+                     alpha: int = 0, omega: int = (1<<32) - 1,
+                     torep: Callable[[str, bool, datetime], int] = timeRep
+                     ) -> tuple[int, int, str]:
            start, stop, mzone = triple
            start = alpha if start is None else torep(start, False)
            stop = omega if stop is None else torep(stop, True)
@ -723,10 +737,11 @@ enumdata.py (keeping the old name as an alias):
                stop = omega
            return start, stop, mzone
-        zones = {} # { ianaId: ( (from, to, meta), ... ) }
+        # zones is { ianaId: ( (from, to, meta), ... ) }
+        zones: dict[str, tuple[tuple[int, int, str], ...]] = {}
        for metaInfo in metaZones.findNodes('metaZones/metazoneInfo'):
            for zone in metaInfo.findAllChildren('timezone'):
-                iana = zoneName(zone.dom.attributes['type'].value)
+                iana: str = zoneName(zone.dom.attributes['type'].value)
                story = tuple(sorted(scanUses(zone), key=sortKey))
                # Only {first,last} entry can have None for {from,to}:
                assert not any(s[0] is None for s in story[1:]), (iana, story)
@ -743,7 +758,7 @@ enumdata.py (keeping the old name as an alias):
                       for zone in bok.values())
                   for metaz, bok in metaMap.items())
-        territorial = {} # { territory code: IANA ID }
+        territorial: dict[str, str] = {} # { territory code: IANA ID }
        for prime in metaZones.findNodes('primaryZones/primaryZone'):
            code = prime.attributes()['iso3166']
            assert code not in territorial, code
@ -752,36 +767,36 @@ enumdata.py (keeping the old name as an alias):
        return metaMap, zones, territorial
    @property
-    def cldrVersion(self):
+    def cldrVersion(self) -> str:
        # Evaluate so as to ensure __cldrVersion is set:
        self.__unDistinguishedAttributes
        return self.__cldrVersion
    # Implementation details
-    def __xml(self, relative_path: str, cache = CacheDict(), read = minidom.parse):
+    def __xml(self, relPath: str, cache = CacheDict(), read = minidom.parse) -> minidom.Element:
        try:
-            doc = cache[relative_path]
+            doc: minidom.Element = cache[relPath]
        except KeyError:
-            cache[relative_path] = doc = read(str(self.root.joinpath(relative_path))).documentElement
+            cache[relPath] = doc = read(str(self.root.joinpath(relPath))).documentElement
        return doc
    def __open(self, relative_path: str) -> TextIO:
        return self.root.joinpath(relative_path).open()
    @property
-    def __rootLocale(self, cache = []):
+    def __rootLocale(self, cache: list[XmlScanner] = []) -> XmlScanner:
        if not cache:
            cache.append(self.xml('common/main/root.xml'))
        return cache[0]
    @property
-    def __supplementalData(self, cache = []):
+    def __supplementalData(self, cache: list[Supplement] = []) -> Supplement:
        if not cache:
            cache.append(self.supplement('supplementalData.xml'))
        return cache[0]
    @property
-    def __numberSystems(self, cache = {}):
+    def __numberSystems(self, cache: dict[str, dict[str, str]] = {}) -> dict[str, dict[str, str]]:
        if not cache:
            for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
                cache[attrs['id']] = attrs
@ -789,20 +804,22 @@ enumdata.py (keeping the old name as an alias):
        return cache
    @property
-    def __weekData(self, cache = {}):
+    def __weekData(self, cache: dict[str, tuple[str, str, str]] = {}
+                   ) -> dict[str, tuple[str, str, str]]:
        if not cache:
+            # firstDay, weStart and weEnd are all dict[str, str]
            firstDay, weStart, weEnd = self.__getWeekData()
            # Massage those into an easily-consulted form:
            # World defaults given for code '001':
            mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001']
-            lands = set(firstDay) | set(weStart) | set(weEnd)
+            lands: set[str] = set(firstDay) | set(weStart) | set(weEnd)
            cache.update((land,
                          (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun)))
                         for land in lands)
            assert cache
        return cache
-    def __getWeekData(self):
+    def __getWeekData(self) -> Iterator[dict[str, str]]:
        """Scan for data on the weekly cycle.
        Yields three mappings from locales to en's short names for
@ -811,12 +828,12 @@ enumdata.py (keeping the old name as an alias):
        gives the day on which the week starts, the second gives the
        day on which the week-end starts, the third gives the last day
        of the week-end."""
-        source = self.__supplementalData
+        source: Supplement = self.__supplementalData
        for key in ('firstDay', 'weekendStart', 'weekendEnd'):
-            result = {}
+            result: dict[str, str] = {}
            for ignore, attrs in source.find(f'weekData/{key}'):
                assert ignore == key
-                day = attrs['day']
+                day: str = attrs['day']
                assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
                if 'alt' in attrs:
                    continue
@ -825,7 +842,8 @@ enumdata.py (keeping the old name as an alias):
            yield result
    @property
-    def __currencyData(self, cache = {}):
+    def __currencyData(self, cache: dict[str, tuple[str, int, int]] = {}
+                       ) -> dict[str, tuple[str, int, int]]:
        if not cache:
            source = self.__supplementalData
            for elt in source.findNodes('currencyData/region'):
@ -850,15 +868,16 @@ enumdata.py (keeping the old name as an alias):
                if iso:
                    for tag, data in source.find(
                        f'currencyData/fractions/info[iso4217={iso}]'):
-                        digits = data['digits']
+                        digits = int(data['digits'])
-                        rounding = data['rounding']
+                        rounding = int(data['rounding'])
                cache[territory] = iso, digits, rounding
            assert cache
        return cache
    @property
-    def __unDistinguishedAttributes(self, cache = {}):
+    def __unDistinguishedAttributes(self, cache: dict[str, tuple[str, ...]] = {}
+                                    ) -> dict[str, tuple[str, ...]]:
        """Mapping from tag names to lists of attributes.
        LDML defines some attributes as 'distinguishing': if a node
@ -878,7 +897,7 @@ enumdata.py (keeping the old name as an alias):
        return cache
-    def __scanLdmlDtd(self):
+    def __scanLdmlDtd(self) -> Iterator[tuple[str, tuple[str, ...]]]:
        """Scan the LDML DTD, record CLDR version
        Yields (tag, attrs) pairs: on elements with a given tag,
@ -920,7 +939,8 @@ enumdata.py (keeping the old name as an alias):
            if tag and ignored:
                yield tag, tuple(ignored)
-    def __enumMap(self, key, cache = {}):
+    def __enumMap(self, key: str, cache: dict[str, dict[str, tuple[int, str]]] = {}
+                  ) -> dict[str, tuple[int, str]]:
        if not cache:
            cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
            # They're mappings from numeric value to pairs of full
@ -943,19 +963,19 @@ enumdata.py (keeping the old name as an alias):
        return cache[key]
-    def __codeMap(self, key, cache = {},
+    def __codeMap(self, key: str, cache: dict[str, dict[str, str]] = {},
                  # Maps our name for it to CLDR's name:
                  naming = {'language': 'languages', 'script': 'scripts',
-                            'territory': 'territories', 'variant': 'variants'}):
+                            'territory': 'territories', 'variant': 'variants'}) -> dict[str, str]:
        if not cache:
-            root = self.xml('common/main/en.xml').root.findUniqueChild('localeDisplayNames')
+            root: Node = self.xml('common/main/en.xml').root.findUniqueChild('localeDisplayNames')
            for dst, src in naming.items():
                cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
            assert cache
        return cache[key]
-    def __codeMapScan(self, node):
+    def __codeMapScan(self, node: Node) -> Iterator[tuple[str, str]]:
        """Get mapping from codes to element values.
        Passed in node is a <languages>, <scripts>, <territories> or
@ -986,23 +1006,23 @@ enumdata.py (keeping the old name as an alias):
    # CLDR uses inheritance between locales to save repetition:
    @property
-    def __parentLocale(self, cache = {}):
+    def __parentLocale(self, cache: dict[str, str] = {}) -> dict[str, str]:
        # see http://www.unicode.org/reports/tr35/#Parent_Locales
        if not cache:
            for tag, attrs in self.__supplementalData.find('parentLocales',
                                                           ('component',)):
-                parent = attrs.get('parent', '')
+                parent: str = attrs.get('parent', '')
                for child in attrs['locales'].split():
                    cache[child] = parent
            assert cache
        return cache
-    def __scanLocaleRoots(self, name: str):
+    def __scanLocaleRoots(self, name: str) -> Iterator[Node]:
        while name and name != 'root':
            path = f'common/main/{name}.xml'
            if self.root.joinpath(path).exists():
-                elt = self.__xml(path) # which has no top-level alias children:
+                elt: minidom.Element = self.__xml(path) # which has no top-level alias children:
                assert not any(True
                               for child in Node(elt).findAllChildren(
                                       'alias', allDull=True)
@ -1019,11 +1039,11 @@ enumdata.py (keeping the old name as an alias):
                    break
    class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
-    def __localeRoots(self, name, cache = CacheDict()):
+    def __localeRoots(self, name: str, cache = CacheDict()) -> __Seq:
        try:
-            chain = cache[name]
+            chain: CldrAccess.__Seq = cache[name]
        except KeyError:
-            cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name))
+            cache[name] = chain = CldrAccess.__Seq(self.__scanLocaleRoots(name))
        return chain
 # Unpolute the namespace: we don't need to export these.
--- a/util/locale_database/localetools.py
+++ b/util/locale_database/localetools.py
@ -64,7 +64,7 @@ def wrap_list(lst, perline=20):
            yield head
    return ",\n".join(", ".join(x) for x in split(lst, perline))
-def names_clash(cldr, enum):
+def names_clash(cldr: str, enum: str) -> None | str:
    """True if the reader might not recognize cldr as the name of enum
    First argument, cldr, is the name CLDR gives for some language,