Fix handling of default content locale data

In principle we iterate over many sources for this data (although in
practice there is only ever one), so accumulate the locales from all of
them instead of keeping only the last; and initialize the accumulator as
the list it always ended up being, not a dictionary, so that this works.

The form of each token is a locale name, so it may be lang_Script just
as readily as lang_LAND; so parse (and validate) the tags more
faithfully to catch both cases.  Abstract that parsing into a function
and use it in both places that need it.

Change-Id: Ibdbc4eafefab6a5ef70646d9fea150f2cb408d05
Reviewed-by: Jędrzej Nowacki <jedrzej.nowacki@qt.io>
This commit is contained in:
Edward Welbourne 2018-08-13 15:21:58 +02:00
parent bbee6fac33
commit c1b15005c6

View File

@ -438,6 +438,36 @@ def integrateWeekData(filePath):
else:
locale.weekendEnd = weekendEndByCountryCode["001"]
def splitLocale(name):
    """Split a locale name into language, script and territory tags.

    This is a generator.  The name is split on underscores; the first
    field is always yielded as the language (it is the empty string if
    name is empty or starts with an underscore).  If name has no
    further fields, nothing more is yielded.  Otherwise exactly two
    more values follow: the script (a capitalised four-letter tag, or
    '' if absent) and the territory (an all-upper-case or all-digit
    tag, or '' if absent).  So the generator always yields either one
    or three values, never two.

    Any fields left over after that are not yielded; they are reported
    with a warning on sys.stderr once the caller iterates past the
    third value.
    """
    tags = iter(name.split('_'))
    # str.split() always returns at least one field, so this cannot fail:
    yield next(tags) # Language

    # End the generator explicitly when the tags run out, rather than
    # letting StopIteration escape from it: that worked by accident in
    # Python 2 but is turned into a RuntimeError by PEP 479.
    try:
        tag = next(tags)
    except StopIteration:
        return

    # Script is always four letters, always capitalised:
    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
        yield tag
        try:
            tag = next(tags)
        except StopIteration:
            tag = ''
    else:
        yield ''

    # Territory is upper-case or numeric:
    if tag and (tag.isupper() or tag.isdigit()):
        yield tag
        tag = ''
    else:
        yield ''

    # A tag that was neither script nor territory is cruft, as is
    # anything after the territory.  If nothing is left, skip the warning:
    if not tag:
        try:
            tag = next(tags)
        except StopIteration:
            return
    cruft = (tag,) + tuple(tags)
    sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(cruft), name))
if len(sys.argv) != 2:
usage()
@ -451,30 +481,25 @@ cldr_files = os.listdir(cldr_dir)
locale_database = {}
# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
defaultContent_locales = {}
defaultContent_locales = []
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
'supplementalMetadata.xml'),
'metadata/defaultContent'):
for data in ns[1:][0]:
if data[0] == u"locales":
defaultContent_locales = data[1].split()
defaultContent_locales += data[1].split()
for file in defaultContent_locales:
items = file.split("_")
if len(items) == 3:
language_code = items[0]
script_code = items[1]
country_code = items[2]
else:
if len(items) != 2:
sys.stderr.write('skipping defaultContent locale "' + file + '" [neither lang_script_country nor lang_country]\n')
continue
language_code = items[0]
script_code = ""
country_code = items[1]
if len(country_code) == 4:
sys.stderr.write('skipping defaultContent locale "' + file + '" [long country code]\n')
continue
try:
language_code, script_code, country_code = splitLocale(file)
except ValueError:
sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
continue
if not (script_code or country_code):
sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
continue
try:
l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
if not l:
@ -548,31 +573,31 @@ def _parseLocale(l):
if l == "und":
raise xpathlite.Error("we are treating unknown locale like C")
items = l.split("_")
language_code = items[0]
parsed = splitLocale(l)
language_code = parsed.next()
script_code = country_code = ''
try:
script_code, country_code = parsed
except ValueError:
pass
if language_code != "und":
language_id = enumdata.languageCodeToId(language_code)
if language_id == -1:
raise xpathlite.Error('unknown language code "%s"' % language_code)
language = enumdata.language_list[language_id][0]
if len(items) > 1:
script_code = items[1]
country_code = ""
if len(items) > 2:
country_code = items[2]
if len(script_code) == 4:
script_id = enumdata.scriptCodeToId(script_code)
if script_id == -1:
raise xpathlite.Error('unknown script code "%s"' % script_code)
script = enumdata.script_list[script_id][0]
else:
country_code = script_code
if country_code:
country_id = enumdata.countryCodeToId(country_code)
if country_id == -1:
raise xpathlite.Error('unknown country code "%s"' % country_code)
country = enumdata.country_list[country_id][0]
if script_code:
script_id = enumdata.scriptCodeToId(script_code)
if script_id == -1:
raise xpathlite.Error('unknown script code "%s"' % script_code)
script = enumdata.script_list[script_id][0]
if country_code:
country_id = enumdata.countryCodeToId(country_code)
if country_id == -1:
raise xpathlite.Error('unknown country code "%s"' % country_code)
country = enumdata.country_list[country_id][0]
return (language, script, country)