Move cldr2qtimezone.py's CLDR-reading to a CldrAccess class

This begins the process of replacing xpathlite.py, adding low-level
DOM-access classes to ldml.py and the CldrAccess class to cldr.py

Moved a format comment from cldr2qtimezone.py's doc-string to the
method of CldrAccess that does the actual reading.

Task-number: QTBUG-81344
Change-Id: I46ae3f402f8207ced6d30a1de5cedaeef47b2bcf
Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
This commit is contained in:
Edward Welbourne 2020-02-27 10:56:36 +01:00 committed by Edward Welbourne
parent 9fab53a513
commit c834dbc6fb
3 changed files with 339 additions and 71 deletions

View File

@ -0,0 +1,182 @@
#############################################################################
##
## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
##
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
## Commercial License Usage
## Licensees holding valid commercial Qt licenses may use this file in
## accordance with the commercial license agreement provided with the
## Software or, alternatively, in accordance with the terms contained in
## a written agreement between you and The Qt Company. For licensing terms
## and conditions see https://www.qt.io/terms-conditions. For further
## information use the contact form at https://www.qt.io/contact-us.
##
## GNU General Public License Usage
## Alternatively, this file may be used under the terms of the GNU
## General Public License version 3 as published by the Free Software
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
## included in the packaging of this file. Please review the following
## information to ensure the GNU General Public License requirements will
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
##
## $QT_END_LICENSE$
##
#############################################################################
"""Digesting the CLDR's data.
Provides one class:
CldrAccess -- used by the reader to access the tree of data files
This should normally be all you need to access.
See individual classes for further detail.
"""
from xml.dom import minidom
from weakref import WeakValueDictionary as CacheDict
import os
from localetools import Error
from ldml import Node, Supplement
class CldrAccess (object):
    """Access point for the data files of an unpacked CLDR archive.

    Instances remember the root directory of the archive and provide
    methods that read and digest particular files beneath it."""

    def __init__(self, root):
        """Set up a master object for accessing CLDR data.

        Single parameter, root, is the file-system path to the root of
        the unpacked CLDR archive; its common/ sub-directory should
        contain dtd/, main/ and supplemental/ sub-directories."""
        self.root = root

    def supplement(self, name):
        """Loads supplemental data as a Supplement object.

        The name should be that of a file in common/supplemental/, without path.
        """
        return Supplement(Node(self.__xml(('common', 'supplemental', name))))

    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
        """Digest CLDR's MS-Win time-zone name mapping.

        MS-Win have their own eccentric names for time-zones.  CLDR
        helpfully provides a translation to more orthodox names.

        Single argument, lookup, is a mapping from known MS-Win names
        for locales to a unique integer index (starting at 1).

        Returns a triple (version, defaults, windows): version is
        self.cldrVersion; defaults maps each Windows key to the IANA
        list given for territory 001; windows maps each pair (Windows
        key, country ID) to a dict with keys windowsId, countryCode,
        ianaList, windowsKey, countryId and country.

        Raises Error if any territory code is missing from the country
        enumeration or any Windows ID is missing from lookup; all such
        omissions are collected before reporting, country codes first.

        The XML structure we read has the form:

         <supplementalData>
             <windowsZones>
                 <mapTimezones otherVersion="..." typeVersion="...">
                     <!-- (UTC-08:00) Pacific Time (US & Canada) -->
                     <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
                     <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
                     <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
                     <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
                 </mapTimezones>
             </windowsZones>
         </supplementalData>
        """
        zones = self.supplement('windowsZones.xml')
        enum = self.__enumMap('country')
        badZones, unLands, defaults, windows = set(), set(), {}, {}

        for name, attrs in zones.find('windowsZones/mapTimezones'):
            if name != 'mapZone':
                continue

            wid, code = attrs['other'], attrs['territory']
            data = dict(windowsId = wid,
                        countryCode = code,
                        ianaList = attrs['type'])

            try:
                key = lookup[wid]
            except KeyError:
                # Remember the omission, but keep going so that we can
                # report all unknown IDs in one go.
                badZones.add(wid)
                key = 0
            data['windowsKey'] = key

            if code == u'001':
                defaults[key] = data['ianaList']
            else:
                try:
                    cid, country = enum[code]
                except KeyError:
                    unLands.add(code) # a set, so .add(), not .append()
                    continue
                data.update(countryId = cid, country = country)
                windows[key, cid] = data

        if unLands:
            raise Error('Unknown country codes, please add to enumdata.py: '
                        + ', '.join(sorted(unLands)))

        if badZones:
            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
                        + ', '.join(sorted(badZones)))

        return self.cldrVersion, defaults, windows

    @property
    def cldrVersion(self):
        """The CLDR version, as recorded in common/dtd/ldml.dtd

        The DTD is only scanned the first time this is asked for."""
        try:
            return self.__cldrVersion
        except AttributeError:
            self.__scanLdmlDtd()
        return self.__cldrVersion

    # Implementation details
    # Helpers taken from imported modules are bound as default
    # arguments: the module deletes the imported names once this class
    # has been defined, so method bodies cannot look them up later.
    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
        """Root element of the XML file at path (a tuple) under self.root

        Parsed documents are cached; the cache is shared by all
        instances, so it is keyed on the full file name in order that
        instances with different roots do not see each other's data."""
        fileName = joinPath(self.root, *path)
        try:
            doc = cache[fileName]
        except KeyError:
            cache[fileName] = doc = read(fileName).documentElement
        return doc

    def __open(self, path, joinPath=os.path.join):
        """Open the file at path (a tuple of components) under self.root"""
        return open(joinPath(self.root, *path))

    @property
    def __supplementalData(self, cache = {}):
        # The cache is shared between instances, so key it on root.
        try:
            return cache[self.root]
        except KeyError:
            cache[self.root] = data = self.supplement('supplementalData.xml')
        return data

    def __scanLdmlDtd(self, joinPath = os.path.join):
        """Scan the LDML DTD, record CLDR version.

        Raises Error if the DTD has no line declaring the version."""
        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
            for line in dtd:
                if line.startswith('<!ATTLIST '):
                    parts = line.split()
                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
                        self.__cldrVersion = parts[5].split('"')[1]
                        return
        raise Error('Failed to find the CLDR version in common/dtd/ldml.dtd')

    def __enumMap(self, key, cache = {}):
        """Mapping from codes to (numeric ID, name) pairs.

        The key selects which enumeration: 'language', 'script',
        'country' or 'variant'.  The tables are built on first use and
        shared by all instances."""
        if not cache:
            # They're not actually lists: mappings from numeric value
            # to pairs of full name and short code. What we want, in
            # each case, is a mapping from code to the other two.
            from enumdata import language_list, script_list, country_list
            # Build everything before publishing it, so that a failure
            # part way through cannot leave a half-filled cache behind.
            tables = {'variant': {'': (0, 'This should never be seen outside ldml.py')}}
            for form, book, empty in (('language', language_list, 'AnyLanguage'),
                                      ('script', script_list, 'AnyScript'),
                                      ('country', country_list, 'AnyCountry')):
                tables[form] = dict((pair[1], (num, pair[0]))
                                    for num, pair in book.items() if pair[0] != 'C')
                # (Have to filter out the C locale, as we give it the
                # same (all space) code as AnyLanguage, whose code
                # should probably be 'und' instead.)

                # Map empty to zero and the any value:
                tables[form][''] = (0, empty)
            # and map language code 'und' also to (0, any):
            tables['language']['und'] = (0, 'AnyLanguage')
            cache.update(tables)

        return cache[key]
# Unpollute the namespace: we don't need to export these.
del minidom, CacheDict, os

View File

@ -34,32 +34,15 @@ the CLDR data. Pass its common/ directory as first parameter to this
script and the qtbase root directory as second parameter. It shall
update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for
use.
The XML structure we read has the form:
<supplementalData>
<version number="$Revision:...$"/>
<generation date="$Date:...$"/>
<windowsZones>
<mapTimezones otherVersion="..." typeVersion="...">
<!-- (UTC-08:00) Pacific Time (US & Canada) -->
<mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
<mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
<mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
<mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
</mapTimezones>
</windowsZones>
</supplementalData>
"""
import os
import re
import datetime
import textwrap
import enumdata
from localetools import unicode2hex, wrap_list, Error, SourceFileEditor
from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, \
_findEntryInFile as findEntryInFile
from cldr import CldrAccess
### Data that may need updates in response to new entries in the CLDR file ###
@ -351,10 +334,10 @@ def main(args, out, err):
"""Parses CLDR's data and updates Qt's representation of it.
Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
arguments. Expects two command-line options: the common/
subdirectory of the unpacked CLDR data-file tree and the root of
the qtbase module's checkout. Updates QTimeZone's private data
about Windows time-zone IDs."""
arguments. Expects two command-line options: the root of the
unpacked CLDR data-file tree and the root of the qtbase module's
checkout. Updates QTimeZone's private data about Windows time-zone
IDs."""
name = args.pop(0)
if len(args) != 2:
usage(err, name, "Expected two arguments")
@ -375,54 +358,17 @@ def main(args, out, err):
usage(err, name, 'No such file: ' + dataFilePath)
return 1
windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml"
if not os.path.isfile(windowsZonesPath):
usage(err, name, 'Failed to find CLDR data file: ' + windowsZonesPath)
try:
version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones(
dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1)))
except IOError as e:
usage(err, name,
'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1]))
return 1
cldrVersion = 'unknown'
ldml = open(cldrPath + "/dtd/ldml.dtd", "r")
for line in ldml:
if 'version cldrVersion CDATA #FIXED' in line:
cldrVersion = line.split('"')[1]
mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones")
if not mapTimezones:
err.write('Failed to find time-zone data - aborting !\n')
return 1
defaultDict, windowsIdDict = {}, {}
badZones = set()
winIdToIndex = dict((name, ind + 1) for ind, name in enumerate(x[0] for x in windowsIdList))
for mapZone in mapTimezones:
# [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]]
if mapZone[0] == u'mapZone':
data = {}
for attribute in mapZone[1]:
if attribute[0] == u'other':
data['windowsId'] = attribute[1]
if attribute[0] == u'territory':
data['countryCode'] = attribute[1]
if attribute[0] == u'type':
data['ianaList'] = attribute[1]
try:
data['windowsKey'] = winIdToIndex[data['windowsId']]
except KeyError:
badZones.add(data['windowsId'])
countryId = 0
if data['countryCode'] == u'001':
defaultDict[data['windowsKey']] = data['ianaList']
else:
data['countryId'] = enumdata.countryCodeToId(data['countryCode'])
if data['countryId'] < 0:
raise Error('Unknown Country Code "{}"'.format(data['countryCode']))
data['country'] = enumdata.country_list[data['countryId']][0]
windowsIdDict[data['windowsKey'], data['countryId']] = data
if badZones:
err.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones))
+ "\nto the windowsIdList in cldr2qtimezone.py\n\n")
except Error as e:
err.write('\n'.join(textwrap.wrap(
'Failed to read windowsZones.xml: ' + (e.message or e.args[1]),
subsequent_indent=' ', width=80)) + '\n')
return 1
out.write('Input file parsed, now writing data\n')
@ -433,7 +379,7 @@ def main(args, out, err):
return 1
try:
writer.write(cldrVersion, defaultDict, windowsIdDict)
writer.write(version, defaults, winIds)
except Error as e:
writer.cleanup()
err.write('\nError in Windows ID data: ' + e.message + '\n')

View File

@ -0,0 +1,140 @@
#############################################################################
##
## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
##
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
## Commercial License Usage
## Licensees holding valid commercial Qt licenses may use this file in
## accordance with the commercial license agreement provided with the
## Software or, alternatively, in accordance with the terms contained in
## a written agreement between you and The Qt Company. For licensing terms
## and conditions see https://www.qt.io/terms-conditions. For further
## information use the contact form at https://www.qt.io/contact-us.
##
## GNU General Public License Usage
## Alternatively, this file may be used under the terms of the GNU
## General Public License version 3 as published by the Free Software
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
## included in the packaging of this file. Please review the following
## information to ensure the GNU General Public License requirements will
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
##
## $QT_END_LICENSE$
##
#############################################################################
"""Parsing the Locale Data Markup Language
It's an XML format, so the raw parsing of XML is, of course, delegated
to xml.dom.minidom; but it has its own specific schemata and some
funky rules for combining data from various files (inheritance between
locales). The use of it we're interested in is extraction of CLDR's
data, so some of the material here is specific to CLDR; see cldr.py
for how it is mainly used.
Provides various classes to wrap xml.dom's objects, specifically those
returned by minidom.parse() and their child-nodes:
Node -- wraps any node in the DOM tree
XmlScanner -- wraps the root element of a stand-alone XML file
Supplement -- specializes XmlScanner for supplemental data files
See individual classes for further detail.
"""
from localetools import Error
class Node (object):
    """Wrapper for an arbitrary DOM node.

    Provides various ways to select children of a node. Selected child
    nodes are returned wrapped as Node objects.  A Node exposes the
    raw DOM node it wraps via its .dom attribute."""

    def __init__(self, elt):
        """Wraps a DOM node for ease of access.

        Single argument, elt, is the DOM node to wrap."""
        self.dom = elt

    def findAllChildren(self, tag, wanted = None):
        """All children that do have the given tag and attributes.

        First argument is the tag: children with any other tag are
        ignored.

        Optional second argument, wanted, should either be None or map
        attribute names to the values they must have. Only child nodes
        with these attributes set to the given values are yielded.

        Children with a draft attribute are only yielded if its value
        scores at least the cut-off (currently: approved or false);
        unrecognised draft values score zero, so are skipped."""

        cutoff = 4 # Only accept approved, for now
        for child in self.dom.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName != tag:
                continue

            try:
                # Indexing attributes yields an Attr node; it is its
                # text value that we need to look up in the scores.
                draft = child.attributes['draft'].nodeValue
            except KeyError:
                pass # No draft attribute: nothing to filter on.
            else:
                if self.__draftScores.get(draft, 0) < cutoff:
                    continue

            if wanted:
                try:
                    if any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
                        continue
                except KeyError: # Some wanted attribute is missing
                    continue

            yield Node(child)

    # Scores for the values of the draft attribute; higher is more settled.
    __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
                         contributed = 3, approved = 4, false = 4)
def _parseXPath(selector):
# Split "tag[attr=val][...]" into tag-name and attribute mapping
attrs = selector.split('[')
name = attrs.pop(0)
if attrs:
attrs = [x.strip() for x in attrs]
assert all(x.endswith(']') for x in attrs)
attrs = [x[:-1].split('=') for x in attrs]
assert all(len(x) in (1, 2) for x in attrs)
attrs = (('type', x[0]) if len(x) == 1 else x for x in attrs)
return name, dict(attrs)
def _iterateEach(iters):
# Flatten a two-layer iterator.
for it in iters:
for item in it:
yield item
class XmlScanner (object):
    """Wrap an XML file to enable XPath access to its nodes.

    The wrapped node is kept as self.root; it needs to supply
    findAllChildren(tag, attrs), as do the nodes that returns.
    """
    def __init__(self, node):
        self.root = node

    def findNodes(self, xpath):
        """Return all nodes under self.root matching this xpath

        The xpath is split on '/' and each step is used to select
        among the children of the nodes matched by the previous step,
        starting from self.root.  The result is a tuple, empty if some
        step matched nothing."""
        current = (self.root,)
        for step in xpath.split('/'):
            tag, wanted = _parseXPath(step)
            matched = []
            for parent in current:
                matched.extend(parent.findAllChildren(tag, wanted))
            current = tuple(matched)
            if not current:
                # Nothing left to search under.
                break
        return current
class Supplement (XmlScanner):
    """XmlScanner specialised for reading supplemental data files."""
    # Replaces xpathlite.findTagsInFile()
    def find(self, xpath):
        """Yield (name, attributes) pairs for what lies under xpath.

        For each node matching xpath, each of its DOM child nodes (or
        the node itself, if it has no children) that has attributes is
        reported as a pair of its node name and a dict mapping
        attribute names to their values.  Entries with no attributes
        are skipped."""
        for node in self.findNodes(xpath):
            dom = node.dom
            for elt in dom.childNodes or (dom,):
                if elt.attributes:
                    # A value may be plain text or an attribute node;
                    # only the latter has a nodeValue to unpack.  (This
                    # test avoids basestring, which Python 3 lacks.)
                    yield (elt.nodeName,
                           dict((k, getattr(v, 'nodeValue', v))
                                for k, v in elt.attributes.items()))