Move cldr2qtimezone.py's CLDR-reading to a CldrAccess class
This begins the process of replacing xpathlite.py, adding low-level DOM-access classes to ldml.py and the CldrAccess class to cldr.py. Moved a format comment from cldr2qtimezone.py's doc-string to the method of CldrAccess that does the actual reading. Task-number: QTBUG-81344 Change-Id: I46ae3f402f8207ced6d30a1de5cedaeef47b2bcf Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
This commit is contained in:
parent
9fab53a513
commit
c834dbc6fb
182
util/locale_database/cldr.py
Normal file
182
util/locale_database/cldr.py
Normal file
@ -0,0 +1,182 @@
|
||||
#############################################################################
|
||||
##
|
||||
## Copyright (C) 2020 The Qt Company Ltd.
|
||||
## Contact: https://www.qt.io/licensing/
|
||||
##
|
||||
## This file is part of the test suite of the Qt Toolkit.
|
||||
##
|
||||
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
|
||||
## Commercial License Usage
|
||||
## Licensees holding valid commercial Qt licenses may use this file in
|
||||
## accordance with the commercial license agreement provided with the
|
||||
## Software or, alternatively, in accordance with the terms contained in
|
||||
## a written agreement between you and The Qt Company. For licensing terms
|
||||
## and conditions see https://www.qt.io/terms-conditions. For further
|
||||
## information use the contact form at https://www.qt.io/contact-us.
|
||||
##
|
||||
## GNU General Public License Usage
|
||||
## Alternatively, this file may be used under the terms of the GNU
|
||||
## General Public License version 3 as published by the Free Software
|
||||
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
|
||||
## included in the packaging of this file. Please review the following
|
||||
## information to ensure the GNU General Public License requirements will
|
||||
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
|
||||
##
|
||||
## $QT_END_LICENSE$
|
||||
##
|
||||
#############################################################################
|
||||
"""Digesting the CLDR's data.
|
||||
|
||||
Provides one class:
|
||||
CldrAccess -- used by the reader to access the tree of data files
|
||||
|
||||
That class should normally be all you need to access.
|
||||
See individual classes for further detail.
|
||||
"""
|
||||
|
||||
from xml.dom import minidom
|
||||
from weakref import WeakValueDictionary as CacheDict
|
||||
import os
|
||||
|
||||
from localetools import Error
|
||||
from ldml import Node, Supplement
|
||||
|
||||
class CldrAccess (object):
    """Access point for the data files of an unpacked CLDR archive.

    Reads supplemental XML files and the LDML DTD found under the root
    directory passed to the constructor."""

    def __init__(self, root):
        """Set up a master object for accessing CLDR data.

        Single parameter, root, is the file-system path to the root of
        the unpacked CLDR archive; its common/ sub-directory should
        contain dtd/, main/ and supplemental/ sub-directories."""
        self.root = root

    def supplement(self, name):
        """Loads supplemental data as a Supplement object.

        The name should be that of a file in common/supplemental/, without path.
        """
        return Supplement(Node(self.__xml(('common', 'supplemental', name))))

    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
        """Digest CLDR's MS-Win time-zone name mapping.

        MS-Win have their own eccentric names for time-zones. CLDR
        helpfully provides a translation to more orthodox names.

        Single argument, lookup, is a mapping from known MS-Win names
        for locales to a unique integer index (starting at 1).

        Returns a triple (version, defaults, windows): version is
        self.cldrVersion; defaults maps each MS-Win key to the IANA
        list given for territory 001; windows maps each (MS-Win key,
        country ID) pair to a dict describing that mapZone entry.

        Raises Error if any territory code is missing from enumdata's
        country list or any MS-Win ID is missing from lookup.

        The XML structure we read has the form:

        <supplementalData>
            <windowsZones>
                <mapTimezones otherVersion="..." typeVersion="...">
                    <!-- (UTC-08:00) Pacific Time (US & Canada) -->
                    <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
                    <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
                    <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
                    <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
                </mapTimezones>
            </windowsZones>
        </supplementalData>
        """
        zones = self.supplement('windowsZones.xml')
        enum = self.__enumMap('country')
        badZones, unLands, defaults, windows = set(), set(), {}, {}

        for name, attrs in zones.find('windowsZones/mapTimezones'):
            if name != 'mapZone':
                continue

            wid, code = attrs['other'], attrs['territory']
            data = dict(windowsId = wid,
                        countryCode = code,
                        ianaList = attrs['type'])

            try:
                key = lookup[wid]
            except KeyError:
                # Remember it, so that all unknown IDs get reported together.
                badZones.add(wid)
                key = 0
            data['windowsKey'] = key

            if code == u'001':
                defaults[key] = data['ianaList']
            else:
                try:
                    cid, land = enum[code]
                except KeyError:
                    # unLands is a set, so it has .add(), not .append().
                    unLands.add(code)
                    continue
                data.update(countryId = cid, country = land)
                windows[key, cid] = data

        if unLands:
            raise Error('Unknown country codes, please add to enumdata.py: '
                        + ', '.join(sorted(unLands)))

        if badZones:
            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
                        + ', '.join(sorted(badZones)))

        return self.cldrVersion, defaults, windows

    @property
    def cldrVersion(self):
        """The CLDR version string recorded in common/dtd/ldml.dtd.

        The DTD is only scanned on first access; the value found then
        is remembered on this object."""
        try:
            return self.__cldrVersion
        except AttributeError:
            self.__scanLdmlDtd()
        return self.__cldrVersion

    # Implementation details
    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
        """Root element of the XML file at the given path under self.root.

        The path is a tuple of path components. The (weak-valued)
        cache is shared by all instances, so its keys include the root
        to keep data from different CLDR trees apart."""
        key = (self.root,) + tuple(path)
        try:
            doc = cache[key]
        except KeyError:
            cache[key] = doc = read(joinPath(self.root, *path)).documentElement
        return doc

    def __open(self, path, joinPath=os.path.join):
        """Open the file at the given tuple of path components under self.root."""
        return open(joinPath(self.root, *path))

    @property
    def __supplementalData(self, cache = {}):
        """The Supplement object for supplementalData.xml.

        Cached per root, since the cache is shared by all instances."""
        try:
            return cache[self.root]
        except KeyError:
            cache[self.root] = data = self.supplement('supplementalData.xml')
        return data

    def __scanLdmlDtd(self, joinPath = os.path.join):
        """Scan the LDML DTD, record CLDR version.

        Raises Error if the DTD has no cldrVersion declaration."""
        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
            for line in dtd:
                if line.startswith('<!ATTLIST '):
                    parts = line.split()
                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
                        self.__cldrVersion = parts[5].split('"')[1]
                        break
            else:
                raise Error('Failed to find the cldrVersion in common/dtd/ldml.dtd')

    def __enumMap(self, key, cache = {}):
        """Mapping from codes to (numeric ID, name) pairs for one kind of enum.

        The key is one of 'language', 'script', 'country' or
        'variant'. The mappings are built from enumdata on first use
        and shared by all instances."""
        if not cache:
            cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
            # They're not actually lists: mappings from numeric value
            # to pairs of full name and short code. What we want, in
            # each case, is a mapping from code to the other two.
            from enumdata import language_list, script_list, country_list
            for form, book, empty in (('language', language_list, 'AnyLanguage'),
                                      ('script', script_list, 'AnyScript'),
                                      ('country', country_list, 'AnyCountry')):
                cache[form] = dict((pair[1], (num, pair[0]))
                                   for num, pair in book.items() if pair[0] != 'C')
                # (Have to filter out the C locale, as we give it the
                # same (all space) code as AnyLanguage, whose code
                # should probably be 'und' instead.)

                # Map empty to zero and the any value:
                cache[form][''] = (0, empty)
            # and map language code 'und' also to (0, any):
            cache['language']['und'] = (0, 'AnyLanguage')

        return cache[key]
|
||||
|
||||
# Unpollute the namespace: we don't need to export these.
|
||||
del minidom, CacheDict, os
|
@ -34,32 +34,15 @@ the CLDR data. Pass its common/ directory as first parameter to this
|
||||
script and the qtbase root directory as second parameter. It shall
|
||||
update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for
|
||||
use.
|
||||
|
||||
The XML structure we read has the form:
|
||||
|
||||
<supplementalData>
|
||||
<version number="$Revision:...$"/>
|
||||
<generation date="$Date:...$"/>
|
||||
<windowsZones>
|
||||
<mapTimezones otherVersion="..." typeVersion="...">
|
||||
<!-- (UTC-08:00) Pacific Time (US & Canada) -->
|
||||
<mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
|
||||
<mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
|
||||
<mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
|
||||
<mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
|
||||
</mapTimezones>
|
||||
</windowsZones>
|
||||
</supplementalData>
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import datetime
|
||||
import textwrap
|
||||
|
||||
import enumdata
|
||||
from localetools import unicode2hex, wrap_list, Error, SourceFileEditor
|
||||
from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, \
|
||||
_findEntryInFile as findEntryInFile
|
||||
from cldr import CldrAccess
|
||||
|
||||
### Data that may need updates in response to new entries in the CLDR file ###
|
||||
|
||||
@ -351,10 +334,10 @@ def main(args, out, err):
|
||||
"""Parses CLDR's data and updates Qt's representation of it.
|
||||
|
||||
Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
|
||||
arguments. Expects two command-line options: the common/
|
||||
subdirectory of the unpacked CLDR data-file tree and the root of
|
||||
the qtbase module's checkout. Updates QTimeZone's private data
|
||||
about Windows time-zone IDs."""
|
||||
arguments. Expects two command-line options: the root of the
|
||||
unpacked CLDR data-file tree and the root of the qtbase module's
|
||||
checkout. Updates QTimeZone's private data about Windows time-zone
|
||||
IDs."""
|
||||
name = args.pop(0)
|
||||
if len(args) != 2:
|
||||
usage(err, name, "Expected two arguments")
|
||||
@ -375,54 +358,17 @@ def main(args, out, err):
|
||||
usage(err, name, 'No such file: ' + dataFilePath)
|
||||
return 1
|
||||
|
||||
windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml"
|
||||
if not os.path.isfile(windowsZonesPath):
|
||||
usage(err, name, 'Failed to find CLDR data file: ' + windowsZonesPath)
|
||||
try:
|
||||
version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones(
|
||||
dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1)))
|
||||
except IOError as e:
|
||||
usage(err, name,
|
||||
'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1]))
|
||||
return 1
|
||||
|
||||
cldrVersion = 'unknown'
|
||||
ldml = open(cldrPath + "/dtd/ldml.dtd", "r")
|
||||
for line in ldml:
|
||||
if 'version cldrVersion CDATA #FIXED' in line:
|
||||
cldrVersion = line.split('"')[1]
|
||||
|
||||
mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones")
|
||||
if not mapTimezones:
|
||||
err.write('Failed to find time-zone data - aborting !\n')
|
||||
return 1
|
||||
|
||||
defaultDict, windowsIdDict = {}, {}
|
||||
badZones = set()
|
||||
winIdToIndex = dict((name, ind + 1) for ind, name in enumerate(x[0] for x in windowsIdList))
|
||||
for mapZone in mapTimezones:
|
||||
# [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]]
|
||||
if mapZone[0] == u'mapZone':
|
||||
data = {}
|
||||
for attribute in mapZone[1]:
|
||||
if attribute[0] == u'other':
|
||||
data['windowsId'] = attribute[1]
|
||||
if attribute[0] == u'territory':
|
||||
data['countryCode'] = attribute[1]
|
||||
if attribute[0] == u'type':
|
||||
data['ianaList'] = attribute[1]
|
||||
|
||||
try:
|
||||
data['windowsKey'] = winIdToIndex[data['windowsId']]
|
||||
except KeyError:
|
||||
badZones.add(data['windowsId'])
|
||||
|
||||
countryId = 0
|
||||
if data['countryCode'] == u'001':
|
||||
defaultDict[data['windowsKey']] = data['ianaList']
|
||||
else:
|
||||
data['countryId'] = enumdata.countryCodeToId(data['countryCode'])
|
||||
if data['countryId'] < 0:
|
||||
raise Error('Unknown Country Code "{}"'.format(data['countryCode']))
|
||||
data['country'] = enumdata.country_list[data['countryId']][0]
|
||||
windowsIdDict[data['windowsKey'], data['countryId']] = data
|
||||
if badZones:
|
||||
err.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones))
|
||||
+ "\nto the windowsIdList in cldr2qtimezone.py\n\n")
|
||||
except Error as e:
|
||||
err.write('\n'.join(textwrap.wrap(
|
||||
'Failed to read windowsZones.xml: ' + (e.message or e.args[1]),
|
||||
subsequent_indent=' ', width=80)) + '\n')
|
||||
return 1
|
||||
|
||||
out.write('Input file parsed, now writing data\n')
|
||||
@ -433,7 +379,7 @@ def main(args, out, err):
|
||||
return 1
|
||||
|
||||
try:
|
||||
writer.write(cldrVersion, defaultDict, windowsIdDict)
|
||||
writer.write(version, defaults, winIds)
|
||||
except Error as e:
|
||||
writer.cleanup()
|
||||
err.write('\nError in Windows ID data: ' + e.message + '\n')
|
||||
|
140
util/locale_database/ldml.py
Normal file
140
util/locale_database/ldml.py
Normal file
@ -0,0 +1,140 @@
|
||||
#############################################################################
|
||||
##
|
||||
## Copyright (C) 2020 The Qt Company Ltd.
|
||||
## Contact: https://www.qt.io/licensing/
|
||||
##
|
||||
## This file is part of the test suite of the Qt Toolkit.
|
||||
##
|
||||
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
|
||||
## Commercial License Usage
|
||||
## Licensees holding valid commercial Qt licenses may use this file in
|
||||
## accordance with the commercial license agreement provided with the
|
||||
## Software or, alternatively, in accordance with the terms contained in
|
||||
## a written agreement between you and The Qt Company. For licensing terms
|
||||
## and conditions see https://www.qt.io/terms-conditions. For further
|
||||
## information use the contact form at https://www.qt.io/contact-us.
|
||||
##
|
||||
## GNU General Public License Usage
|
||||
## Alternatively, this file may be used under the terms of the GNU
|
||||
## General Public License version 3 as published by the Free Software
|
||||
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
|
||||
## included in the packaging of this file. Please review the following
|
||||
## information to ensure the GNU General Public License requirements will
|
||||
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
|
||||
##
|
||||
## $QT_END_LICENSE$
|
||||
##
|
||||
#############################################################################
|
||||
"""Parsing the Locale Data Markup Language
|
||||
|
||||
It's an XML format, so the raw parsing of XML is, of course, delegated
|
||||
to xml.dom.minidom; but it has its own specific schemata and some
|
||||
funky rules for combining data from various files (inheritance between
|
||||
locales). The use of it we're interested in is extraction of CLDR's
|
||||
data, so some of the material here is specific to CLDR; see cldr.py
|
||||
for how it is mainly used.
|
||||
|
||||
Provides various classes to wrap xml.dom's objects, specifically those
|
||||
returned by minidom.parse() and their child-nodes:
|
||||
Node -- wraps any node in the DOM tree
|
||||
XmlScanner -- wraps the root element of a stand-alone XML file
|
||||
Supplement -- specializes XmlScanner for supplemental data files
|
||||
|
||||
See individual classes for further detail.
|
||||
"""
|
||||
from localetools import Error
|
||||
|
||||
class Node (object):
    """Wrapper for an arbitrary DOM node.

    Provides various ways to select children of a node. Selected child
    nodes are returned wrapped as Node objects. A Node exposes the
    raw DOM node it wraps via its .dom attribute."""

    def __init__(self, elt):
        """Wraps a DOM node for ease of access.

        Single argument, elt, is the DOM node to wrap."""
        self.dom = elt

    def findAllChildren(self, tag, wanted = None):
        """All children that do have the given tag and attributes.

        First argument is the tag: children with any other tag are
        ignored.

        Optional second argument, wanted, should either be None or map
        attribute names to the values they must have. Only child nodes
        with these attributes set to the given values are yielded.

        Children whose draft attribute scores below approved are
        skipped; an unrecognised draft value scores zero."""

        cutoff = 4 # Only accept approved, for now
        for child in self.dom.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName != tag:
                continue

            try:
                # Look up the attribute's text, not the Attr node itself:
                # the latter is never a key of __draftScores.
                draft = child.attributes['draft'].nodeValue
            except KeyError:
                pass
            else:
                if self.__draftScores.get(draft, 0) < cutoff:
                    continue

            if wanted:
                try:
                    if any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
                        continue
                except KeyError: # Some wanted attribute is missing
                    continue

            yield Node(child)

    # Scores for the values of the draft attribute; higher is more trusted.
    __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
                         contributed = 3, approved = 4, false = 4)
|
||||
|
||||
def _parseXPath(selector):
|
||||
# Split "tag[attr=val][...]" into tag-name and attribute mapping
|
||||
attrs = selector.split('[')
|
||||
name = attrs.pop(0)
|
||||
if attrs:
|
||||
attrs = [x.strip() for x in attrs]
|
||||
assert all(x.endswith(']') for x in attrs)
|
||||
attrs = [x[:-1].split('=') for x in attrs]
|
||||
assert all(len(x) in (1, 2) for x in attrs)
|
||||
attrs = (('type', x[0]) if len(x) == 1 else x for x in attrs)
|
||||
return name, dict(attrs)
|
||||
|
||||
def _iterateEach(iters):
|
||||
# Flatten a two-layer iterator.
|
||||
for it in iters:
|
||||
for item in it:
|
||||
yield item
|
||||
|
||||
class XmlScanner (object):
    """Wrap an XML file to enable XPath access to its nodes.
    """
    def __init__(self, node):
        """Single argument, node, is saved as self.root.

        The search in findNodes() starts from it, so it needs a
        findAllChildren() method, as Node provides."""
        self.root = node

    def findNodes(self, xpath):
        """Return all nodes under self.root matching this xpath

        The xpath is a '/'-joined sequence of selectors, each in the
        form _parseXPath() digests. Returns a tuple, which is empty if
        any step of the path matches nothing."""
        found = (self.root,)
        for step in xpath.split('/'):
            name, wanted = _parseXPath(step)
            matches = []
            for node in found:
                matches.extend(node.findAllChildren(name, wanted))
            found = tuple(matches)
            if not found:
                # Nothing left to search below, so no later step can match.
                break
        return found
|
||||
|
||||
class Supplement (XmlScanner):
    """XmlScanner for supplemental data files, reporting attributes of nodes."""
    # Replaces xpathlite.findTagsInFile()
    def find(self, xpath):
        """Yield (name, attributes) pairs for what lies under an xpath.

        For each node matching xpath, looks at its child nodes or, if
        it has none, at the node itself. Each of these that has any
        attributes is reported as a pair of its node name and a dict
        mapping its attribute names to their values."""
        elts = self.findNodes(xpath)
        for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
                                for e in elts):
            if elt.attributes:
                # Attribute values are usually plain strings already; unwrap
                # anything that offers a nodeValue. Avoids basestring, which
                # only Python 2 has.
                yield (elt.nodeName,
                       dict((k, getattr(v, 'nodeValue', v))
                            for k, v in elt.attributes.items()))
|
Loading…
x
Reference in New Issue
Block a user