Add type annotations to CldrAccess

Task-number: QTBUG-129613
Pick-to: 6.8
Change-Id: I8a00cca718554909b7ab9dcad15cc9b9ac702e94
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Mate Barany 2024-10-10 17:02:56 +02:00
parent adc4ec9d39
commit defd1549de
2 changed files with 93 additions and 73 deletions

View File

@ -10,11 +10,11 @@ The former should normally be all you need to access.
See individual classes for further detail.
"""
from typing import Iterable, TextIO
from typing import Callable, Iterable, Iterator, TextIO
from xml.dom import minidom
from weakref import WeakValueDictionary as CacheDict
from pathlib import Path
from datetime import datetime
from datetime import datetime, timedelta
from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
from localetools import names_clash
@ -309,7 +309,7 @@ class CldrReader (object):
# the cache. If a process were to instantiate this class with distinct
# roots, each cache would be filled by the first to need it!
class CldrAccess (object):
def __init__(self, root: Path):
def __init__(self, root: Path) -> None:
"""Set up a master object for accessing CLDR data.
Single parameter, root, is the file-system path to the root of
@ -317,20 +317,20 @@ class CldrAccess (object):
contain dtd/, main/ and supplemental/ sub-directories."""
self.root = root
def xml(self, relative_path: str):
def xml(self, relative_path: str) -> XmlScanner:
"""Load a single XML file and return its root element as an XmlScanner.
The path is interpreted relative to self.root"""
return XmlScanner(Node(self.__xml(relative_path)))
def supplement(self, name):
def supplement(self, name: str) -> Supplement:
"""Loads supplemental data as a Supplement object.
The name should be that of a file in common/supplemental/, without path.
"""
return Supplement(Node(self.__xml(f'common/supplemental/{name}')))
def locale(self, name):
def locale(self, name: str) -> LocaleScanner:
"""Loads all data for a locale as a LocaleScanner object.
The name should be a locale name; adding suffix '.xml' to it
@ -340,7 +340,7 @@ class CldrAccess (object):
inheritance, where relevant."""
return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
def englishNaming(self, tag): # see QLocaleXmlWriter.enumData()
def englishNaming(self, tag: str) -> Callable[[str], str]: # see QLocaleXmlWriter.enumData()
return self.__codeMap(tag).get
@property
@ -354,18 +354,18 @@ class CldrAccess (object):
yield path.stem
@property
def defaultContentLocales(self):
def defaultContentLocales(self) -> Iterator[str]:
"""Generator for the default content locales."""
for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'):
try:
locales = attrs['locales']
locales: str = attrs['locales']
except KeyError:
pass
else:
for locale in locales.split():
yield locale
def likelySubTags(self):
def likelySubTags(self) -> Iterator[tuple[str, str]]:
for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'):
yield attrs['from'], attrs['to']
@ -380,7 +380,7 @@ class CldrAccess (object):
except KeyError:
raise Error(f'Unsupported number system: {system}')
def weekData(self, territory):
def weekData(self, territory: str) -> tuple[str, str, str]:
"""Data on the weekly cycle.
Returns a triple (W, S, E) of en's short names for week-days;
@ -393,7 +393,7 @@ class CldrAccess (object):
except KeyError:
return self.__weekData['001']
def currencyData(self, territory):
def currencyData(self, territory: str) -> tuple[str, int, int]:
"""Returns currency data for the given territory code.
Return value is a tuple (ISO4217 code, digit count, rounding
@ -405,7 +405,9 @@ class CldrAccess (object):
except KeyError:
return '', 2, 1
def codesToIdName(self, language, script, territory, variant = ''):
def codesToIdName(self, language: str, script: str, territory: str, variant: str = ''
) -> tuple[tuple[int, str], tuple[int, str],
tuple[int, str], tuple[int, str]]:
"""Maps each code to the appropriate ID and name.
Returns a 4-tuple of (ID, name) pairs corresponding to the
@ -417,7 +419,7 @@ class CldrAccess (object):
Until we implement variant support (QTBUG-81051), the fourth
member of the returned tuple is always 0 paired with a string
that should not be used."""
enum = self.__enumMap
enum: Callable[[str], dict[str, tuple[int, str]]] = self.__enumMap
try:
return (enum('language')[language],
enum('script')[script],
@ -428,8 +430,9 @@ class CldrAccess (object):
parts, values = [], [language, script, territory, variant]
for index, key in enumerate(('language', 'script', 'territory', 'variant')):
naming, enums = self.__codeMap(key), enum(key)
value = values[index]
naming: dict[str, str] = self.__codeMap(key)
enums: dict[str, tuple[int, str]] = enum(key)
value: str = values[index]
if value not in enums:
text = f'{key} code {value}'
name = naming.get(value)
@ -447,21 +450,22 @@ class CldrAccess (object):
language, script, territory, variant)
@staticmethod
def __checkEnum(given, proper, scraps):
def __checkEnum(given: dict[str, str], proper: dict[str, str], scraps: set[str]
) -> Iterator[tuple[str, str]]:
# Each is a { code: full name } mapping
for code, name in given.items():
try: right = proper[code]
try: right: str = proper[code]
except KeyError:
# No en.xml name for this code, but supplementalData's
# parentLocale may still believe in it:
if code not in scraps:
yield name, f'[Found no CLDR name for code {code}]'
continue
cleaned = names_clash(right, name)
cleaned: None | str = names_clash(right, name)
if cleaned:
yield name, cleaned
def checkEnumData(self, grumble):
def checkEnumData(self, grumble: Callable[[str], int]) -> None:
scraps = set()
for k in self.__parentLocale.keys():
for f in k.split('_'):
@ -492,7 +496,7 @@ enumdata.py (keeping the old name as an alias):
+ '\n')
grumble('\n')
def bcp47Aliases(self):
def bcp47Aliases(self) -> tuple[dict[str, str], dict[str, str]]:
"""Reads the mapping from CLDR IDs to IANA IDs
CLDR identifies timezones in various ways but its standard
@ -530,7 +534,8 @@ enumdata.py (keeping the old name as an alias):
# If we ever need a mapping back to CLDR ID, we can make
# (description, space-joined-list) the naming values.
alias, naming = {}, {} # { alias: iana }, { iana: description }
alias: dict[str, str] = {} # { alias: iana }
naming: dict[str, str] = {} # { iana: description }
for item, attrs in root.find('keyword/key/type', exclude=('deprecated',)):
assert 'description' in attrs, item
assert 'alias' in attrs, item
@ -545,7 +550,8 @@ enumdata.py (keeping the old name as an alias):
return alias, naming
def readWindowsTimeZones(self, alias):
def readWindowsTimeZones(self, alias: dict[str, str]) -> tuple[dict[str, str],
list[tuple[str, str, str]]]:
"""Digest CLDR's MS-Win time-zone name mapping.
Single argument, alias, should be the first part of the pair
@ -582,7 +588,8 @@ enumdata.py (keeping the old name as an alias):
mapZone element and the last is s, its cleaned-up list of IANA
IDs."""
defaults, windows = {}, []
defaults: dict[str, str] = {}
windows: list[tuple[str, str, str]] = []
zones = self.supplement('windowsZones.xml')
for name, attrs in zones.find('windowsZones/mapTimezones'):
if name != 'mapZone':
@ -602,7 +609,10 @@ enumdata.py (keeping the old name as an alias):
return defaults, windows
def readMetaZoneMap(self, alias):
def readMetaZoneMap(self, alias: dict[str, str]
) -> tuple[dict[str, dict[str, str]],
dict[str, tuple[tuple[int, int, str], ...]],
dict[str, str]]:
"""Digests the metaZones supplemental data.
Required argument, alias, should be the first of
@ -633,9 +643,9 @@ enumdata.py (keeping the old name as an alias):
locale."""
metaZones = self.supplement('metaZones.xml') # Doesn't appear to use draft attribute
# Map CLDR name to IANA name (or use CLDR name if unknown to alias):
zoneName = lambda n, g=alias.get: g(n, n)
zoneName: Callable[[str], str] = lambda n, g=alias.get: g(n, n)
metaMap = {} # { meta: { territory code: zoneId } }
metaMap: dict[str, dict[str, str]] = {} # { meta: { territory code: zoneId } }
# Entry with territory 001 is "golden zone" for the metazone.
for mapMeta in metaZones.findNodes('metaZones/mapTimezones'):
attrs = mapMeta.attributes()
@ -646,13 +656,13 @@ enumdata.py (keeping the old name as an alias):
raise Error('Version of metazone map type is not 2018e', attrs)
for node in mapMeta.findAllChildren('mapZone'):
attrs = node.attributes()
attrs: dict[str, str] = node.attributes()
try:
meta, code, zone = attrs['other'], attrs['territory'], attrs['type']
except KeyError:
continue
bok = metaMap.setdefault(meta, {})
bok: dict[str, str] = metaMap.setdefault(meta, {})
assert code not in bok, (meta, code)
bok[code] = zoneName(zone)
# Territories not named in a metaMap entry fall back on the
@ -660,16 +670,16 @@ enumdata.py (keeping the old name as an alias):
# entry:
assert all('001' in bok for bok in metaMap.values())
def scanUses(zone, check=metaMap):
def scanUses(zone: Node, check=metaMap) -> Iterator[tuple[str|None, str|None, str]]:
for node in zone.findAllChildren('usesMetazone'):
attrs = node.attributes()
mzone = attrs['mzone']
attrs: dict[str, str] = node.attributes()
mzone: str = attrs['mzone']
if mzone not in check:
raise Error('Unknown metazone', mzone)
# These are UTC date-times.
yield attrs.get('from'), attrs.get('to'), mzone
def sortKey(triple):
def sortKey(triple: tuple[str|None, str|None, str]) -> str | None:
start, stop, mzone = triple
# The start = None entry should sort first; since its key
# is its stop, which is likely the next entry's start, we
@ -680,11 +690,11 @@ enumdata.py (keeping the old name as an alias):
# in the list, so the sorting is fatuous and the key
# doesn't matter).
def timeRep(text, notime, epoch=datetime(1970, 1, 1, 0, 0)):
def timeRep(text: str, notime: bool, epoch=datetime(1970, 1, 1, 0, 0)) -> int:
"""Map a 'yyyy-MM-dd HH:mm' string to epoch minutes.
If the HH:mm part is omitted, second parameter notime is true to
use the end of the day, false for the start. LDM specifies this
use the end of the day, false for the start. LDML specifies this
reading of the pure-date values for start and stop attributes. If
the HH:mm part is 24:00, the end of the day is also used; LDML
specifies this but python's datetime.fromisoformat() doesn't like
@ -704,16 +714,20 @@ enumdata.py (keeping the old name as an alias):
assert len(text) == 16, text
# If it's given with HH:mm as 24:00, this throws:
diff = datetime.fromisoformat(text) - epoch
diff: timedelta = datetime.fromisoformat(text) - epoch
except ValueError:
diff = datetime.fromisoformat(text[:10]) - epoch
diff += diff.__class__(days=1)
assert diff.days >= 0 and diff.seconds >= 0, (diff, text)
assert diff.seconds % 60 == 0, (diff, text)
return diff.days * 1440 + int(diff.seconds / 60)
mins, secs = divmod(diff.seconds, 60)
assert secs == 0, (diff, text)
return diff.days * 1440 + mins
def mapTimes(triple, alpha=0, omega=(1<<32)-1, torep=timeRep):
def mapTimes(triple: tuple[str|None, str|None, str],
alpha: int = 0, omega: int = (1<<32) - 1,
torep: Callable[[str, bool, datetime], int] = timeRep
) -> tuple[int, int, str]:
start, stop, mzone = triple
start = alpha if start is None else torep(start, False)
stop = omega if stop is None else torep(stop, True)
@ -723,10 +737,11 @@ enumdata.py (keeping the old name as an alias):
stop = omega
return start, stop, mzone
zones = {} # { ianaId: ( (from, to, meta), ... ) }
# zones is { ianaId: ( (from, to, meta), ... ) }
zones: dict[str, tuple[tuple[int, int, str], ...]] = {}
for metaInfo in metaZones.findNodes('metaZones/metazoneInfo'):
for zone in metaInfo.findAllChildren('timezone'):
iana = zoneName(zone.dom.attributes['type'].value)
iana: str = zoneName(zone.dom.attributes['type'].value)
story = tuple(sorted(scanUses(zone), key=sortKey))
# Only {first,last} entry can have None for {from,to}:
assert not any(s[0] is None for s in story[1:]), (iana, story)
@ -743,7 +758,7 @@ enumdata.py (keeping the old name as an alias):
for zone in bok.values())
for metaz, bok in metaMap.items())
territorial = {} # { territory code: IANA ID }
territorial: dict[str, str] = {} # { territory code: IANA ID }
for prime in metaZones.findNodes('primaryZones/primaryZone'):
code = prime.attributes()['iso3166']
assert code not in territorial, code
@ -752,36 +767,36 @@ enumdata.py (keeping the old name as an alias):
return metaMap, zones, territorial
@property
def cldrVersion(self):
def cldrVersion(self) -> str:
# Evaluate so as to ensure __cldrVersion is set:
self.__unDistinguishedAttributes
return self.__cldrVersion
# Implementation details
def __xml(self, relative_path: str, cache = CacheDict(), read = minidom.parse):
def __xml(self, relPath: str, cache = CacheDict(), read = minidom.parse) -> minidom.Element:
try:
doc = cache[relative_path]
doc: minidom.Element = cache[relPath]
except KeyError:
cache[relative_path] = doc = read(str(self.root.joinpath(relative_path))).documentElement
cache[relPath] = doc = read(str(self.root.joinpath(relPath))).documentElement
return doc
def __open(self, relative_path: str) -> TextIO:
return self.root.joinpath(relative_path).open()
@property
def __rootLocale(self, cache = []):
def __rootLocale(self, cache: list[XmlScanner] = []) -> XmlScanner:
if not cache:
cache.append(self.xml('common/main/root.xml'))
return cache[0]
@property
def __supplementalData(self, cache = []):
def __supplementalData(self, cache: list[Supplement] = []) -> Supplement:
if not cache:
cache.append(self.supplement('supplementalData.xml'))
return cache[0]
@property
def __numberSystems(self, cache = {}):
def __numberSystems(self, cache: dict[str, dict[str, str]] = {}) -> dict[str, dict[str, str]]:
if not cache:
for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
cache[attrs['id']] = attrs
@ -789,20 +804,22 @@ enumdata.py (keeping the old name as an alias):
return cache
@property
def __weekData(self, cache = {}):
def __weekData(self, cache: dict[str, tuple[str, str, str]] = {}
) -> dict[str, tuple[str, str, str]]:
if not cache:
# firstDay, weStart and weEnd are all dict[str, str]
firstDay, weStart, weEnd = self.__getWeekData()
# Massage those into an easily-consulted form:
# World defaults given for code '001':
mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001']
lands = set(firstDay) | set(weStart) | set(weEnd)
lands: set[str] = set(firstDay) | set(weStart) | set(weEnd)
cache.update((land,
(firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun)))
for land in lands)
assert cache
return cache
def __getWeekData(self):
def __getWeekData(self) -> Iterator[dict[str, str]]:
"""Scan for data on the weekly cycle.
Yields three mappings from locales to en's short names for
@ -811,12 +828,12 @@ enumdata.py (keeping the old name as an alias):
gives the day on which the week starts, the second gives the
day on which the week-end starts, the third gives the last day
of the week-end."""
source = self.__supplementalData
source: Supplement = self.__supplementalData
for key in ('firstDay', 'weekendStart', 'weekendEnd'):
result = {}
result: dict[str, str] = {}
for ignore, attrs in source.find(f'weekData/{key}'):
assert ignore == key
day = attrs['day']
day: str = attrs['day']
assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
if 'alt' in attrs:
continue
@ -825,7 +842,8 @@ enumdata.py (keeping the old name as an alias):
yield result
@property
def __currencyData(self, cache = {}):
def __currencyData(self, cache: dict[str, tuple[str, int, int]] = {}
) -> dict[str, tuple[str, int, int]]:
if not cache:
source = self.__supplementalData
for elt in source.findNodes('currencyData/region'):
@ -850,15 +868,16 @@ enumdata.py (keeping the old name as an alias):
if iso:
for tag, data in source.find(
f'currencyData/fractions/info[iso4217={iso}]'):
digits = data['digits']
rounding = data['rounding']
digits = int(data['digits'])
rounding = int(data['rounding'])
cache[territory] = iso, digits, rounding
assert cache
return cache
@property
def __unDistinguishedAttributes(self, cache = {}):
def __unDistinguishedAttributes(self, cache: dict[str, tuple[str, ...]] = {}
) -> dict[str, tuple[str, ...]]:
"""Mapping from tag names to lists of attributes.
LDML defines some attributes as 'distinguishing': if a node
@ -878,7 +897,7 @@ enumdata.py (keeping the old name as an alias):
return cache
def __scanLdmlDtd(self):
def __scanLdmlDtd(self) -> Iterator[tuple[str, tuple[str, ...]]]:
"""Scan the LDML DTD, record CLDR version
Yields (tag, attrs) pairs: on elements with a given tag,
@ -920,7 +939,8 @@ enumdata.py (keeping the old name as an alias):
if tag and ignored:
yield tag, tuple(ignored)
def __enumMap(self, key, cache = {}):
def __enumMap(self, key: str, cache: dict[str, dict[str, tuple[int, str]]] = {}
) -> dict[str, tuple[int, str]]:
if not cache:
cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
# They're mappings from numeric value to pairs of full
@ -943,19 +963,19 @@ enumdata.py (keeping the old name as an alias):
return cache[key]
def __codeMap(self, key, cache = {},
def __codeMap(self, key: str, cache: dict[str, dict[str, str]] = {},
# Maps our name for it to CLDR's name:
naming = {'language': 'languages', 'script': 'scripts',
'territory': 'territories', 'variant': 'variants'}):
'territory': 'territories', 'variant': 'variants'}) -> dict[str, str]:
if not cache:
root = self.xml('common/main/en.xml').root.findUniqueChild('localeDisplayNames')
root: Node = self.xml('common/main/en.xml').root.findUniqueChild('localeDisplayNames')
for dst, src in naming.items():
cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
assert cache
return cache[key]
def __codeMapScan(self, node):
def __codeMapScan(self, node: Node) -> Iterator[tuple[str, str]]:
"""Get mapping from codes to element values.
Passed in node is a <languages>, <scripts>, <territories> or
@ -986,23 +1006,23 @@ enumdata.py (keeping the old name as an alias):
# CLDR uses inheritance between locales to save repetition:
@property
def __parentLocale(self, cache = {}):
def __parentLocale(self, cache: dict[str, str] = {}) -> dict[str, str]:
# see http://www.unicode.org/reports/tr35/#Parent_Locales
if not cache:
for tag, attrs in self.__supplementalData.find('parentLocales',
('component',)):
parent = attrs.get('parent', '')
parent: str = attrs.get('parent', '')
for child in attrs['locales'].split():
cache[child] = parent
assert cache
return cache
def __scanLocaleRoots(self, name: str):
def __scanLocaleRoots(self, name: str) -> Iterator[Node]:
while name and name != 'root':
path = f'common/main/{name}.xml'
if self.root.joinpath(path).exists():
elt = self.__xml(path) # which has no top-level alias children:
elt: minidom.Element = self.__xml(path) # which has no top-level alias children:
assert not any(True
for child in Node(elt).findAllChildren(
'alias', allDull=True)
@ -1019,11 +1039,11 @@ enumdata.py (keeping the old name as an alias):
break
class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
def __localeRoots(self, name, cache = CacheDict()):
def __localeRoots(self, name: str, cache = CacheDict()) -> __Seq:
try:
chain = cache[name]
chain: CldrAccess.__Seq = cache[name]
except KeyError:
cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name))
cache[name] = chain = CldrAccess.__Seq(self.__scanLocaleRoots(name))
return chain
# Unpollute the namespace: we don't need to export these.

View File

@ -64,7 +64,7 @@ def wrap_list(lst, perline=20):
yield head
return ",\n".join(", ".join(x) for x in split(lst, perline))
def names_clash(cldr, enum):
def names_clash(cldr: str, enum: str) -> None | str:
"""True if the reader might not recognize cldr as the name of enum
First argument, cldr, is the name CLDR gives for some language,