Rework cldr2qlocalexml.py in terms of a QLocaleXmlWriter class

Delegate the output of XML to a helper class provided by qlocalexml.py
and restructure the driver script so that it can be imported without
running anything. It now has a minimal __name__ == '__main__' block
that calls a main() function. This, for the moment, requires a global
via which it shares the CLDR directory with various other functions;
that shall go away in a later commit.

Task-number: QTBUG-81344
Change-Id: Ica2d3ec09f2d38ba42fd930258cc765283f29a71
Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
This commit is contained in:
Edward Welbourne 2020-02-19 15:17:16 +01:00 committed by Edward Welbourne
parent 54886d7f81
commit a20697a394
2 changed files with 327 additions and 194 deletions

View File

@ -61,13 +61,13 @@ import enumdata
import xpathlite
from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
from dateconverter import convert_date
from qlocalexml import Locale
from qlocalexml import Locale, QLocaleXmlWriter
# TODO: make calendars a command-line option
calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
findEntryInFile = xpathlite._findEntryInFile
def wrappedwarn(prefix, tokens):
return sys.stderr.write(
def wrappedwarn(err, prefix, tokens):
    """Write a wrapped warning about tokens to err.

    The message is prefix followed by the tokens, joined with ', ';
    it is wrapped to 80 columns, with each continuation line indented
    by one space, and terminated by a newline.  Returns whatever
    err.write() returns.
    """
    message = prefix + ', '.join(tokens)
    lines = textwrap.wrap(message, subsequent_indent=' ', width=80)
    return err.write('\n'.join(lines) + '\n')
@ -101,6 +101,7 @@ def parse_number_format(patterns, data):
result.append(pattern)
return result
cldr_dir = None
def raiseUnknownCode(code, form, cache={}):
"""Check whether an unknown code could be supported.
@ -193,8 +194,8 @@ def getNumberSystems(cache={}):
"""Cached look-up of number system information.
Pass no arguments. Returns a mapping from number system names to,
for each system, a mapping with keys u'digits', u'type' and
u'id'\n"""
for each system, a mapping with keys 'digits', 'type' and 'id'.
Relies on global cldr_dir being set before it's first called.\n"""
if not cache:
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
'numberingSystems.xml'),
@ -419,26 +420,7 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
return Locale(result)
def addEscapes(s):
result = ''
for c in s:
n = ord(c)
if n < 128:
result += c
else:
result += "\\x"
result += "%02x" % (n)
return result
def unicodeStr(s):
utf8 = s.encode('utf-8')
return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"
def usage():
print "Usage: cldr2qlocalexml.py <path-to-cldr-main>"
sys.exit()
def integrateWeekData(filePath):
def integrateWeekData(filePath, locale_database):
if not filePath.endswith(".xml"):
return {}
@ -510,111 +492,6 @@ def splitLocale(name):
tag = (tag if tag else tags.next(),)
sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
if len(sys.argv) != 2:
usage()
cldr_dir = sys.argv[1]
if not os.path.isdir(cldr_dir):
usage()
cldr_files = os.listdir(cldr_dir)
locale_database = {}
# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
defaultContent_locales = []
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
'supplementalMetadata.xml'),
'metadata/defaultContent'):
for data in ns[1:][0]:
if data[0] == u"locales":
defaultContent_locales += data[1].split()
skips = []
for file in defaultContent_locales:
try:
language_code, script_code, country_code = splitLocale(file)
except ValueError:
sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
continue
if not (script_code or country_code):
sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
continue
try:
l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
if not l:
skips.append(file)
continue
except xpathlite.Error as e:
sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
continue
locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
if skips:
wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
skips = []
for file in cldr_files:
try:
l = generateLocaleInfo(cldr_dir + "/" + file)
if not l:
skips.append(file)
continue
except xpathlite.Error as e:
sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
continue
locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
if skips:
wrappedwarn('skipping files [no locale info generated]: ', skips)
integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
locale_keys = locale_database.keys()
locale_keys.sort()
cldr_version = 'unknown'
ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r")
for line in ldml:
if 'version cldrVersion CDATA #FIXED' in line:
cldr_version = line.split('"')[1]
print "<localeDatabase>"
print " <version>" + cldr_version + "</version>"
print " <languageList>"
for id in enumdata.language_list:
l = enumdata.language_list[id]
print " <language>"
print " <name>" + l[0] + "</name>"
print " <id>" + str(id) + "</id>"
print " <code>" + l[1] + "</code>"
print " </language>"
print " </languageList>"
print " <scriptList>"
for id in enumdata.script_list:
l = enumdata.script_list[id]
print " <script>"
print " <name>" + l[0] + "</name>"
print " <id>" + str(id) + "</id>"
print " <code>" + l[1] + "</code>"
print " </script>"
print " </scriptList>"
print " <countryList>"
for id in enumdata.country_list:
l = enumdata.country_list[id]
print " <country>"
print " <name>" + l[0] + "</name>"
print " <id>" + str(id) + "</id>"
print " <code>" + l[1] + "</code>"
print " </country>"
print " </countryList>"
def _parseLocale(l):
language = "AnyLanguage"
script = "AnyScript"
@ -651,48 +528,135 @@ def _parseLocale(l):
return (language, script, country)
skips = []
print " <likelySubtags>"
for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
tmp = {}
for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
tmp[data[0]] = data[1]
def likelySubtags(root, err):
skips = []
for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"):
tmp = {}
for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
tmp[data[0]] = data[1]
try:
from_language, from_script, from_country = _parseLocale(tmp[u"from"])
to_language, to_script, to_country = _parseLocale(tmp[u"to"])
except xpathlite.Error as e:
if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
skips.append(tmp[u'to'])
else:
sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
continue
# substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
if to_country == "AnyCountry" and from_country != to_country:
to_country = from_country
if to_script == "AnyScript" and from_script != to_script:
to_script = from_script
try:
from_language, from_script, from_country = _parseLocale(tmp[u"from"])
to_language, to_script, to_country = _parseLocale(tmp[u"to"])
except xpathlite.Error as e:
if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
skips.append(tmp[u'to'])
else:
sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
continue
# substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
if to_country == "AnyCountry" and from_country != to_country:
to_country = from_country
if to_script == "AnyScript" and from_script != to_script:
to_script = from_script
print " <likelySubtag>"
print " <from>"
print " <language>" + from_language + "</language>"
print " <script>" + from_script + "</script>"
print " <country>" + from_country + "</country>"
print " </from>"
print " <to>"
print " <language>" + to_language + "</language>"
print " <script>" + to_script + "</script>"
print " <country>" + to_country + "</country>"
print " </to>"
print " </likelySubtag>"
print " </likelySubtags>"
if skips:
wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
print " <localeList>"
yield ((from_language, from_script, from_country),
(to_language, to_script, to_country))
if skips:
wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips)
Locale.C(calendars).toXml(calendars)
for key in locale_keys:
locale_database[key].toXml(calendars)
def usage(err, name, message=''):
    """Write a usage summary for the program called name to err.

    If message is non-empty, it is written after the summary,
    preceded by a blank line and followed by a newline.
    """
    # TODO: expand
    err.write('Usage: {} <path-to-cldr-main> [out-file.xml]\n'.format(name))
    if not message:
        return
    err.write('\n' + message + '\n')
print " </localeList>"
print "</localeDatabase>"
def main(args, out, err):
    """Digest CLDR data and write it out as QLocale XML.

    Arguments:
      args -- the command line, as a list; it is consumed.  Its
              entries are the program name, the path to CLDR's main/
              directory and, optionally, the name of a file to which
              to write the XML.
      out -- stream to which to write the XML when args names no
             output file.
      err -- stream to which to write usage messages and warnings.

    Returns 0 on success, 1 if the command line was unusable (after
    writing a usage message to err).  Sets the global cldr_dir, on
    which other functions in this module rely.
    """
    global cldr_dir

    name = args.pop(0)
    if not args:
        usage(err, name)
        return 1

    cldr_dir = args.pop(0)
    if not os.path.isdir(cldr_dir):
        usage(err, name, 'Where did you unpack the CLDR data files ?')
        return 1

    if len(args) > 1:
        usage(err, name, 'Too many arguments passed')
        return 1

    qxml = open(args.pop(0), 'w') if args else out
    try:
        # Prime the number-system cache now that cldr_dir is set.  It
        # takes no arguments: its only parameter is its cache, so
        # passing cldr_dir would be taken as an already-filled cache.
        getNumberSystems()
        cldr_files = os.listdir(cldr_dir)
        locale_database = {}

        # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
        defaultContent_locales = []
        for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
                                              'supplementalMetadata.xml'),
                                 'metadata/defaultContent'):
            for data in ns[1:][0]:
                if data[0] == u"locales":
                    defaultContent_locales += data[1].split()

        skips = []
        for locale_name in defaultContent_locales:
            try:
                language_code, script_code, country_code = splitLocale(locale_name)
            except ValueError:
                err.write('skipping defaultContent locale "' + locale_name
                          + '" [neither two nor three tags]\n')
                continue

            if not (script_code or country_code):
                err.write('skipping defaultContent locale "' + locale_name
                          + '" [second tag is neither script nor territory]\n')
                continue

            try:
                locale = _generateLocaleInfo(cldr_dir + "/" + locale_name + ".xml",
                                             language_code, script_code, country_code)
            except xpathlite.Error as e:
                err.write('skipping defaultContent locale "{}" ({})\n'.format(
                        locale_name, str(e)))
                continue
            if not locale:
                skips.append(locale_name)
                continue

            locale_database[(locale.language_id, locale.script_id,
                             locale.country_id, locale.variant_code)] = locale

        if skips:
            wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ',
                        skips)

        skips = []
        for file_name in cldr_files:
            try:
                locale = generateLocaleInfo(cldr_dir + "/" + file_name)
            except xpathlite.Error as e:
                err.write('skipping file "{}" ({})\n'.format(file_name, str(e)))
                continue
            if not locale:
                skips.append(file_name)
                continue

            locale_database[(locale.language_id, locale.script_id,
                             locale.country_id, locale.variant_code)] = locale

        if skips:
            wrappedwarn(err, 'skipping files [no locale info generated]: ', skips)

        integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml",
                          locale_database)

        cldr_version = 'unknown'
        with open(cldr_dir + "/../dtd/ldml.dtd", "r") as ldml:
            for line in ldml:
                if 'version cldrVersion CDATA #FIXED' in line:
                    cldr_version = line.split('"')[1]

        xmlOut = QLocaleXmlWriter(qxml.write)
        xmlOut.version(cldr_version)
        xmlOut.enumData(enumdata.language_list,
                        enumdata.script_list,
                        enumdata.country_list)
        # Reach the parent of main/ the same way as every other look-up
        # above; os.path.split() would yield cldr_dir itself if it was
        # given with a trailing slash, and nothing useful for '.'.
        xmlOut.likelySubTags(likelySubtags(os.path.join(cldr_dir, os.pardir), err))
        xmlOut.locales(locale_database, calendars)
        xmlOut.close()
    finally:
        # Only close what we opened; out belongs to the caller.
        if qxml is not out:
            qxml.close()

    return 0

if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv, sys.stdout, sys.stderr))

View File

@ -1,7 +1,7 @@
# coding=utf8
#############################################################################
##
## Copyright (C) 2018 The Qt Company Ltd.
## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
@ -28,11 +28,17 @@
#############################################################################
"""Shared serialization-scanning code for QLocaleXML format.
The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py
Provides classes:
Locale -- common data-type representing one locale as a namespace
QLocaleXmlWriter -- helper to write a QLocaleXML file
Support:
Spacer -- provides control over indentation of the output.
"""
from __future__ import print_function
from xml.sax.saxutils import escape
import xpathlite
from xpathlite import Error
# Tools used by Locale:
def camel(seq):
@ -43,10 +49,14 @@ def camel(seq):
def camelCase(words):
return ''.join(camel(iter(words)))
def addEscapes(s):
    """Return s with each character outside ASCII replaced by an escape.

    Characters whose code point is below 128 are kept as they are;
    any other is replaced by a backslash, an 'x' and its code point
    in lower-case hexadecimal, padded to at least two digits.
    """
    pieces = []
    for char in s:
        code = ord(char)
        pieces.append(char if code < 128 else '\\x{:02x}'.format(code))
    return ''.join(pieces)
def ordStr(c):
if len(c) == 1:
return str(ord(c))
raise xpathlite.Error('Unable to handle value "%s"' % addEscapes(c))
raise Error('Unable to handle value "{}"'.format(addEscapes(c)))
# Fix for a problem with QLocale returning a character instead of
# strings for QLocale::exponential() and others. So we fallback to
@ -69,6 +79,8 @@ def convertFormat(format):
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
"""
# Compare and contrast dateconverter.py's convert_date().
# Need to (check consistency and) reduce redundancy !
result = ""
i = 0
while i < len(format):
@ -113,7 +125,163 @@ def convertFormat(format):
return result
class Locale:
class Spacer (object):
    """Callable that adds indentation and newlines to lines of XML.

    Instances are called with one line of text at a time and return
    the text to actually write; see __init__() for the details.
    """
    def __init__(self, indent = None, initial = ''):
        """Prepare to manage indentation and line breaks.

        Arguments are both optional.

        First argument, indent, is either None (its default, for
        'minifying'), an integer (number of spaces) or the unit of
        text that is to be used for each indentation level (e.g. '\t'
        to use tabs). If indent is None, no indentation is added, nor
        are line-breaks; otherwise, self(text), for non-empty text,
        shall end with a newline and begin with indentation.

        Second argument, initial, is the initial indentation; it is
        ignored if indent is None. Indentation increases after each
        call to self(text) in which text starts with a tag and doesn't
        include its end-tag; indentation decreases if text starts with
        an end-tag. Comments and declarations ('<!'), processing
        instructions ('<?') and self-closing tags ('<tag/>') leave the
        indentation unchanged. The text is not parsed any more
        carefully than just described.
        """
        if indent is None:
            self.__call = lambda x: x
        else:
            self.__each = ' ' * indent if isinstance(indent, int) else indent
            self.current = initial
            self.__call = self.__wrap

    def __wrap(self, line):
        """Return line, indented and newline-terminated.

        Updates self.current, the indentation for the next line,
        according to whether line opens or closes an element. An
        empty line produces just a newline, with no indentation.
        """
        if not line:
            return '\n'

        indent = self.current
        if line.startswith('</'):
            # Slice by explicit length: a [:-len] slice would discard
            # everything when the indentation unit is empty.
            keep = max(0, len(indent) - len(self.__each))
            indent = self.current = indent[:keep]
        elif line.startswith('<') and not line.startswith(('<!', '<?')):
            cut = line.find('>')
            head = line[1:] if cut < 0 else line[1 : cut]
            words = head.split()
            # Nothing to close later for <tag/>, nor for a nameless '<>':
            if (words and not head.rstrip().endswith('/')
                and '</{}>'.format(words[0]) not in line):
                self.current += self.__each
        return indent + line + '\n'

    def __call__(self, line):
        return self.__call(line)
class QLocaleXmlWriter (object):
    """Helper to write digested CLDR data as a QLocale XML file.

    Construction writes the opening tag of the document; the methods
    enumData(), likelySubTags(), locales() and version() each write
    one section of it and close() writes its closing tag.
    """
    def __init__(self, save = None, space = None):
        """Set up to write digested CLDR data as QLocale XML.

        Arguments are both optional.

        First argument, save, is None (its default) or a callable that
        will write content to where you intend to save it. If None, it
        is replaced with a callable that prints the given content,
        suppressing the newline (but see the following); this is
        equivalent to passing sys.stdout.write.

        Second argument, space, is an object to call on each text
        output to prepend indentation and append newlines, or not as
        the case may be. If None (its default), a fresh Spacer(4) is
        used, which grows indent by four spaces after each unmatched
        new tag and shrinks back on a close-tag (its parsing is naive,
        but adequate to how this class uses it), while adding a
        newline to each line.
        """
        self.__rawOutput = self.__printit if save is None else save
        # A Spacer carries the current indentation as state, so each
        # writer needs its own rather than one shared default instance.
        self.__wrap = Spacer(4) if space is None else space
        self.__write('<localeDatabase>')

    # Output of various sections, in their usual order:
    def enumData(self, languages, scripts, countries):
        """Write the language, script and country tables.

        Each argument is a mapping from numeric id to a sequence whose
        first two entries are the name and code for that id.
        """
        self.__enumTable('languageList', languages)
        self.__enumTable('scriptList', scripts)
        self.__enumTable('countryList', countries)

    def likelySubTags(self, entries):
        """Write the likely sub-tags table.

        Sole argument, entries, is an iterable of pairs (have, give),
        each member of which is a (language, script, country) triple.
        """
        self.__openTag('likelySubtags')
        for have, give in entries:
            self.__openTag('likelySubtag')
            self.__likelySubTag('from', have)
            self.__likelySubTag('to', give)
            self.__closeTag('likelySubtag')
        self.__closeTag('likelySubtags')

    def locales(self, locales, calendars):
        """Write the list of locales, starting with the C locale.

        First argument, locales, is a mapping whose values have a
        toXml() method taking an element-writer and the calendars;
        they are written in the sorted order of their keys. Second
        argument is the list of calendar names to pass on.
        """
        self.__openTag('localeList')
        self.__openTag('locale')
        Locale.C(calendars).toXml(self.inTag, calendars)
        self.__closeTag('locale')
        for key in sorted(locales):
            self.__openTag('locale')
            locales[key].toXml(self.inTag, calendars)
            self.__closeTag('locale')
        self.__closeTag('localeList')

    def version(self, cldrVersion):
        """Write the version element, with the given content."""
        self.inTag('version', cldrVersion)

    def inTag(self, tag, text):
        """Write text as the content of an element named tag.

        The text is written as it stands: any XML-escaping it needs is
        the caller's responsibility.
        """
        self.__write('<{0}>{1}</{0}>'.format(tag, text))

    def close(self):
        """Write the document's closing tag and forbid further output.

        Calling it again does nothing; any other attempt to write
        after it raises Error.
        """
        if self.__rawOutput != self.__complain:
            self.__write('</localeDatabase>')
        self.__rawOutput = self.__complain

    # Implementation details
    @staticmethod
    def __printit(text):
        print(text, end='')
    @staticmethod
    def __complain(text):
        raise Error('Attempted to write data after closing :-(')

    def __enumTable(self, tag, table):
        self.__openTag(tag)
        # Each entry's element is named by tag without its 'List' suffix:
        for key, value in table.items():
            self.__openTag(tag[:-4])
            self.inTag('name', value[0])
            self.inTag('id', key)
            self.inTag('code', value[1])
            self.__closeTag(tag[:-4])
        self.__closeTag(tag)

    def __likelySubTag(self, tag, likely):
        self.__openTag(tag)
        self.inTag('language', likely[0])
        self.inTag('script', likely[1])
        self.inTag('country', likely[2])
        # self.inTag('variant', likely[3])
        self.__closeTag(tag)

    def __openTag(self, tag):
        self.__write('<{}>'.format(tag))
    def __closeTag(self, tag):
        self.__write('</{}>'.format(tag))

    def __write(self, line):
        self.__rawOutput(self.__wrap(line))
class Locale (object):
"""Holder for the assorted data representing one locale.
Implemented as a namespace; its constructor and update() have the
same signatures as those of a dict, acting on the instance's
__dict__, so the results are accessed as attributes rather than
mapping keys."""
def __init__(self, data=None, **kw):
self.update(data, **kw)
def update(self, data=None, **kw):
if data: self.__dict__.update(data)
if kw: self.__dict__.update(kw)
def __len__(self): # Used when testing as a boolean
return len(self.__dict__)
@staticmethod
def propsMonthDay(scale, lengths=('long', 'short', 'narrow')):
for L in lengths:
@ -176,19 +344,26 @@ class Locale:
return cls(data)
def toXml(self, calendars=('gregorian',), indent=' ', tab=' '):
print indent + '<locale>'
inner = indent + tab
def toXml(self, write, calendars=('gregorian',)):
"""Writes its data as QLocale XML.
First argument, write, is a callable taking the name and
content of an XML element; it is expected to be the inTag
bound method of a QLocaleXmlWriter instance.
Optional second argument is a list of calendar names, in the
form used by CLDR; its default is ('gregorian',).
"""
get = lambda k: getattr(self, k)
for key in ('language', 'script', 'country'):
print inner + "<%s>" % key + get(key) + "</%s>" % key
print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key
write(key, get(key))
write('{}code'.format(key), get('{}_code'.format(key)))
for key in ('decimal', 'group', 'zero'):
print inner + "<%s>" % key + ordStr(get(key)) + "</%s>" % key
write(key, ordStr(get(key)))
for key, std in (('list', ';'), ('percent', '%'),
('minus', '-'), ('plus', '+'), ('exp', 'e')):
print inner + "<%s>" % key + fixOrdStr(get(key), std) + "</%s>" % key
write(key, fixOrdStr(get(key), std))
for key in ('languageEndonym', 'countryEndonym',
'quotationStart', 'quotationEnd',
@ -206,16 +381,10 @@ class Locale:
'_'.join((k, cal))
for k in self.propsMonthDay('months')
for cal in calendars):
print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key)
write(key, escape(get(key)).encode('utf-8'))
for key in ('currencyDigits', 'currencyRounding'):
print inner + "<%s>%d</%s>" % (key, get(key), key)
print indent + "</locale>"
def __init__(self, data=None, **kw):
if data: self.__dict__.update(data)
if kw: self.__dict__.update(kw)
write(key, get(key))
# Tools used by __monthNames:
def fullName(i, name): return name
@ -261,8 +430,8 @@ class Locale:
for cal in calendars:
try:
data = known[cal]
except KeyError: # Need to add an entry to known, above.
print 'Unsupported calendar:', cal
except KeyError as e: # Need to add an entry to known, above.
e.args += ('Unsupported calendar:', cal)
raise
names, get = data[0] + ('',), data[1:]
for n, size in enumerate(sizes):
@ -279,7 +448,7 @@ class Locale:
'Thursday', 'Friday', 'Saturday', ''),
quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
"""Returns an object representing the C locale."""
return cls(dict(cls.__monthNames(calendars)),
return cls(cls.__monthNames(calendars),
language='C', language_code='0', languageEndonym='',
script='AnyScript', script_code='0',
country='AnyCountry', country_code='0', countryEndonym='',