Suggest name, when available, for unknown codes

When parsing the CLDR data, we only handle language, script and
territory (which we call country) codes if they are known to our
enumdata.py tables.  When reporting the rest as unknown, in the
content of an actual locale definition (not the likely subtag data),
check whether en.xml can resolve the code for us; if it can, report
the full name it provides, as a hint to whoever's running the script
that an update to enumdata.py may be in order.

Change-Id: I9ca1d6922a91d45bc436f4b622e5557261897d7f
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Reviewed-by: Konstantin Ritt <ritt.ks@gmail.com>
This commit is contained in:
Edward Welbourne 2019-05-08 15:20:30 +02:00
parent 248b6756da
commit b7d8169f02
2 changed files with 57 additions and 5 deletions

View File

@ -95,6 +95,34 @@ def parse_number_format(patterns, data):
result.append(pattern)
return result
def raiseUnknownCode(code, form, cache={}):
    """Report an unknown code, hinting at its name when en.xml knows it.

    A language, script or country code counts as unknown when
    enumdata.py has no entry for it.  If the en.xml file in cldr_dir
    nonetheless maps the code to a name, support for it could be
    added; so the error raised here mentions that name, for the
    benefit of whoever is running the script.

    Parameters:
      code -- the code that was not recognised.
      form -- which kind of code it is: 'language', 'script' or
              'country'.
      cache -- private; callers must not supply it.  The shared
               default dictionary is what lets en.xml be read only
               once, on the first call; passing anything else
               forfeits that.

    Always raises xpathlite.Error; the message names the kind of code
    and the code itself, followed by the full name when one is found.

    Depends on the global cldr_dir having been set before the first
    call (see the tail of this file).
    """
    if not cache:
        # First call: load all three code-to-name mappings in one go.
        en_xml = os.path.join(cldr_dir, 'en.xml')
        cache.update(xpathlite.codeMapsFromFile(en_xml))

    suggestion = cache[form].get(code)
    hint = ' - could use "%s"' % suggestion if suggestion else ''
    raise xpathlite.Error('unknown %s code "%s"' % (form, code) + hint)
def parse_list_pattern_part_format(pattern):
    """Convert a list pattern part's {0}, {1}, {2} into %1, %2, %3.

    Returns the pattern with each of those three placeholders
    replaced; all other text is left untouched.
    """
    # This is a very limited parsing of the format for list pattern part only.
    for index in range(3):
        # Placeholders are numbered from 0, their replacements from 1.
        pattern = pattern.replace("{%d}" % index, "%%%d" % (index + 1))
    return pattern
@ -193,18 +221,18 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
language_id = enumdata.languageCodeToId(language_code)
if language_id <= 0:
raise xpathlite.Error('unknown language code "%s"' % language_code)
raiseUnknownCode(language_code, 'language')
script_id = enumdata.scriptCodeToId(script_code)
if script_id == -1:
raise xpathlite.Error('unknown script code "%s"' % script_code)
raiseUnknownCode(script_code, 'script')
# we should handle fully qualified names with the territory
if not country_code:
return {}
country_id = enumdata.countryCodeToId(country_code)
if country_id <= 0:
raise xpathlite.Error('unknown country code "%s"' % country_code)
raiseUnknownCode(country_code, 'country')
# So we say we accept only those values that have "contributed" or
# "approved" resolution. see http://www.unicode.org/cldr/process.html

View File

@ -78,14 +78,38 @@ def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
return node
return False
def codeMapsFromFile(file):
    """Extract mappings of language, script and country codes to names.

    The file shall typically be common/main/en.xml, which contains a
    localeDisplayNames element with children languages, scripts and
    territories; each element in each of these has a code as its type
    attribute and its name as element content.

    Returns a mapping with keys 'language', 'script' and 'country',
    each of which has, as value, a mapping of the relevant codes to
    names.  When a code appears more than once, an entry carrying an
    alt attribute never replaces a name that has already been read
    for that code.
    """
    parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
    keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
    for src, dst in keys.items():
        child = findChild(parent, src)
        data = result[dst] = {}
        for elt in child.childNodes:
            # Skip nodes with no attributes at all, then insist on a type.
            # hasAttribute() and the "in" operator replace has_key(),
            # which Python 3 dictionaries no longer provide.
            if elt.attributes and elt.hasAttribute('type'):
                key, value = elt.getAttribute('type'), elt.childNodes[0].wholeText
                # Don't over-write previously-read data for an alt form:
                if elt.hasAttribute('alt') and key in data:
                    continue
                data[key] = value
    return result
def findTagsInFile(file, path):
doc = parseDoc(file)
elt = doc.documentElement
tag_spec_list = path.split("/")
last_entry = None
for i in range(len(tag_spec_list)):
tag_spec = tag_spec_list[i]
for tag_spec in tag_spec_list:
tag_name = tag_spec
arg_name = 'type'
arg_value = ''