Move cldr2qtimezone.py's CLDR-reading to a CldrAccess class

This begins the process of replacing xpathlite.py, adding low-level DOM-access classes to ldml.py and the CldrAccess class to cldr.py Moved a format comment from cldr2qtimezone.py's doc-string to the method of CldrAccess that does the actual reading. Task-number: QTBUG-81344 Change-Id: I46ae3f402f8207ced6d30a1de5cedaeef47b2bcf Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
2020-02-27 10:56:36 +01:00 · 2020-02-27 10:56:36 +01:00 · c834dbc6fb
commit c834dbc6fb
parent 9fab53a513
3 changed files with 339 additions and 71 deletions
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@ -0,0 +1,182 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Digesting the CLDR's data.
+
+Provides one class:
+  CldrAccess -- used by the reader to access the tree of data files
+
+This should normally be all you need to access.
+See individual classes for further detail.
+"""
+
+from xml.dom import minidom
+from weakref import WeakValueDictionary as CacheDict
+import os
+
+from localetools import Error
+from ldml import Node, Supplement
+
+class CldrAccess (object):
+    def __init__(self, root):
+        """Set up a master object for accessing CLDR data.
+
+        Single parameter, root, is the file-system path to the root of
+        the unpacked CLDR archive; its common/ sub-directory should
+        contain dtd/, main/ and supplemental/ sub-directories."""
+        self.root = root
+
+    def supplement(self, name):
+        """Loads supplemental data as a Supplement object.
+
+        The name should be that of a file in common/supplemental/, without path.
+        """
+        return Supplement(Node(self.__xml(('common', 'supplemental', name))))
+
+    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
+        """Digest CLDR's MS-Win time-zone name mapping.
+
+        MS-Win have their own eccentric names for time-zones.  CLDR
+        helpfully provides a translation to more orthodox names.
+
+        Single argument, lookup, is a mapping from known MS-Win names
+        for time-zones to a unique integer index (starting at 1).
+
+        Returns a triple (version, defaults, windows): the CLDR version,
+        a mapping from MS-Win key to the IANA IDs of its territory="001"
+        entry and a mapping from (MS-Win key, country ID) to a dict of
+        details of that entry.  Raises Error, once all entries are read,
+        if any country code or MS-Win ID was not recognized.
+
+        The XML structure we read has the form:
+
+ <supplementalData>
+     <windowsZones>
+         <mapTimezones otherVersion="..." typeVersion="...">
+             <!-- (UTC-08:00) Pacific Time (US & Canada) -->
+             <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
+             <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
+             <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
+             <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
+         </mapTimezones>
+     </windowsZones>
+ </supplementalData>
+"""
+        zones = self.supplement('windowsZones.xml')
+        enum = self.__enumMap('country')
+        badZones, unLands, defaults, windows = set(), set(), {}, {}
+
+        for tag, attrs in zones.find('windowsZones/mapTimezones'):
+            if tag != 'mapZone':
+                continue
+
+            wid, code = attrs['other'], attrs['territory']
+            if wid not in lookup:
+                badZones.add(wid) # Reported, all together, after the loop
+            key = lookup.get(wid, 0)
+            data = dict(windowsId = wid, countryCode = code, ianaList = attrs['type'], windowsKey = key)
+
+            if code == u'001':
+                defaults[key] = data['ianaList']
+            else:
+                try:
+                    cid, land = enum[code]
+                except KeyError:
+                    unLands.add(code) # It's a set, so has no append()
+                    continue
+                data.update(countryId = cid, country = land)
+                windows[key, cid] = data
+
+        if unLands:
+            raise Error('Unknown country codes, please add to enumdata.py: '
+                        + ', '.join(sorted(unLands)))
+
+        if badZones:
+            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
+                        + ', '.join(sorted(badZones)))
+
+        return self.cldrVersion, defaults, windows
+
+    @property
+    def cldrVersion(self):
+        """The CLDR version given by common/dtd/ldml.dtd, 'unknown' if it gives none."""
+        self.__scanLdmlDtd() # Ensures __cldrVersion is set
+        return self.__cldrVersion
+
+    # Implementation details
+    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
+        """Root element of the XML file at path, a tuple of names below self.root."""
+        key = (self.root,) + tuple(path) # The cache is shared by all instances
+        try:
+            doc = cache[key]
+        except KeyError:
+            cache[key] = doc = read(joinPath(self.root, *path)).documentElement
+        return doc
+
+    def __open(self, path, joinPath=os.path.join):
+        """Open, for reading, the file at path, a tuple of names below self.root."""
+        return open(joinPath(self.root, *path))
+
+    @property
+    def __supplementalData(self, cache = {}):
+        if self.root not in cache: # Keyed by root, as cache is shared by all instances
+            cache[self.root] = self.supplement('supplementalData.xml')
+        return cache[self.root]
+
+    def __scanLdmlDtd(self, joinPath = os.path.join):
+        """Scan the LDML DTD, record CLDR version ('unknown' if not found)."""
+        self.__cldrVersion = 'unknown' # So that it is set even if no line matches
+        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
+            for line in dtd:
+                if line.startswith('<!ATTLIST '):
+                    parts = line.split()
+                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
+                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+                        self.__cldrVersion = parts[5].split('"')[1]
+                        break
+
+    def __enumMap(self, key, cache = {}):
+        """Mapping from code to (numeric value, full name) for language, script, country or variant."""
+        if not cache:
+            cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
+            # They're not actually lists: each maps numeric value to a pair of
+            # full name and short code. We want mappings from code to the other two.
+            from enumdata import language_list, script_list, country_list
+            for form, book, empty in (('language', language_list, 'AnyLanguage'),
+                                      ('script', script_list, 'AnyScript'),
+                                      ('country', country_list, 'AnyCountry')):
+                # Have to filter out the C locale, as we give it the same (all
+                # space) code as AnyLanguage, whose code should probably be 'und'.
+                cache[form] = dict((pair[1], (num, pair[0]))
+                                   for num, pair in book.items() if pair[0] != 'C')
+                cache[form][''] = (0, empty) # Map empty to zero and the any value
+            # and map language code 'und' also to (0, any):
+            cache['language']['und'] = (0, 'AnyLanguage')
+        return cache[key]
+
+# Unpollute the namespace: we don't need to export these.
+del minidom, CacheDict, os
--- a/util/locale_database/cldr2qtimezone.py
+++ b/util/locale_database/cldr2qtimezone.py
@ -34,32 +34,15 @@ the CLDR data.  Pass its common/ directory as first parameter to this
 script and the qtbase root directory as second parameter.  It shall
 update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for
 use.
-
-The XML structure we read has the form:
-
- <supplementalData>
-     <version number="$Revision:...$"/>
-     <generation date="$Date:...$"/>
-     <windowsZones>
-         <mapTimezones otherVersion="..." typeVersion="...">
-             <!-- (UTC-08:00) Pacific Time (US & Canada) -->
-             <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
-             <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
-             <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
-             <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
-         </mapTimezones>
-     </windowsZones>
- </supplementalData>
 """

 import os
 import re
 import datetime
+import textwrap

-import enumdata
 from localetools import unicode2hex, wrap_list, Error, SourceFileEditor
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, \
-    _findEntryInFile as findEntryInFile
+from cldr import CldrAccess

 ### Data that may need updates in response to new entries in the CLDR file ###

@ -351,10 +334,10 @@ def main(args, out, err):
    """Parses CLDR's data and updates Qt's representation of it.

    Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
-    arguments. Expects two command-line options: the common/
-    subdirectory of the unpacked CLDR data-file tree and the root of
-    the qtbase module's checkout. Updates QTimeZone's private data
-    about Windows time-zone IDs."""
+    arguments. Expects two command-line options: the root of the
+    unpacked CLDR data-file tree and the root of the qtbase module's
+    checkout. Updates QTimeZone's private data about Windows time-zone
+    IDs."""
    name = args.pop(0)
    if len(args) != 2:
        usage(err, name, "Expected two arguments")
@ -375,54 +358,17 @@ def main(args, out, err):
        usage(err, name, 'No such file: ' + dataFilePath)
        return 1

-    windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml"
-    if not os.path.isfile(windowsZonesPath):
-        usage(err, name, 'Failed to find CLDR data file: ' + windowsZonesPath)
+    try:
+        version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones(
+            dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1)))
+    except IOError as e:
+        usage(err, name,
+              'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1]))
        return 1
-
-    cldrVersion = 'unknown'
-    ldml = open(cldrPath + "/dtd/ldml.dtd", "r")
-    for line in ldml:
-        if 'version cldrVersion CDATA #FIXED' in line:
-            cldrVersion = line.split('"')[1]
-
-    mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones")
-    if not mapTimezones:
-        err.write('Failed to find time-zone data - aborting !\n')
-        return 1
-
-    defaultDict, windowsIdDict = {}, {}
-    badZones = set()
-    winIdToIndex = dict((name, ind + 1) for ind, name in enumerate(x[0] for x in windowsIdList))
-    for mapZone in mapTimezones:
-        # [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]]
-        if mapZone[0] == u'mapZone':
-            data = {}
-            for attribute in mapZone[1]:
-                if attribute[0] == u'other':
-                    data['windowsId'] = attribute[1]
-                if attribute[0] == u'territory':
-                    data['countryCode'] = attribute[1]
-                if attribute[0] == u'type':
-                    data['ianaList'] = attribute[1]
-
-            try:
-                data['windowsKey'] = winIdToIndex[data['windowsId']]
-            except KeyError:
-                badZones.add(data['windowsId'])
-
-            countryId = 0
-            if data['countryCode'] == u'001':
-                defaultDict[data['windowsKey']] = data['ianaList']
-            else:
-                data['countryId'] = enumdata.countryCodeToId(data['countryCode'])
-                if data['countryId'] < 0:
-                    raise Error('Unknown Country Code "{}"'.format(data['countryCode']))
-                data['country'] = enumdata.country_list[data['countryId']][0]
-                windowsIdDict[data['windowsKey'], data['countryId']] = data
-    if badZones:
-        err.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones))
-                  + "\nto the windowsIdList in cldr2qtimezone.py\n\n")
+    except Error as e:
+        err.write('\n'.join(textwrap.wrap(
+                    'Failed to read windowsZones.xml: ' + (e.message or e.args[1]),
+                    subsequent_indent=' ', width=80)) + '\n')
        return 1

    out.write('Input file parsed, now writing data\n')
@ -433,7 +379,7 @@ def main(args, out, err):
        return 1

    try:
-        writer.write(cldrVersion, defaultDict, windowsIdDict)
+        writer.write(version, defaults, winIds)
    except Error as e:
        writer.cleanup()
        err.write('\nError in Windows ID data: ' + e.message + '\n')
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@ -0,0 +1,140 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Parsing the Locale Data Markup Language
+
+It's an XML format, so the raw parsing of XML is, of course, delegated
+to xml.dom.minidom; but it has its own specific schemata and some
+funky rules for combining data from various files (inheritance between
+locales). The use of it we're interested in is extraction of CLDR's
+data, so some of the material here is specific to CLDR; see cldr.py
+for how it is mainly used.
+
+Provides various classes to wrap xml.dom's objects, specifically those
+returned by minidom.parse() and their child-nodes:
+  Node -- wraps any node in the DOM tree
+  XmlScanner -- wraps the root element of a stand-alone XML file
+  Supplement -- specializes XmlScanner for supplemental data files
+
+See individual classes for further detail.
+"""
+from localetools import Error
+
+class Node (object):
+    """Wrapper for an arbitrary DOM node.
+
+    Provides various ways to select children of a node. Selected child
+    nodes are returned wrapped as Node objects.  A Node exposes the
+    raw DOM node it wraps via its .dom attribute."""
+
+    def __init__(self, elt):
+        """Wraps a DOM node for ease of access.
+
+        Single argument, elt, is the DOM node to wrap."""
+        self.dom = elt
+
+    def findAllChildren(self, tag, wanted = None):
+        """All children that do have the given tag and attributes.
+
+        First argument is the tag: children with any other tag are
+        ignored.
+
+        Optional second argument, wanted, should either be None or map
+        attribute names to the values they must have. Only child nodes
+        with these attributes set to the given values are yielded.
+
+        Children whose draft attribute is anything but approved or false are skipped."""
+
+        cutoff = 4 # Only accept approved, for now
+        for child in self.dom.childNodes:
+            if child.nodeType != child.ELEMENT_NODE or child.nodeName != tag:
+                continue
+
+            try:
+                draft = child.attributes['draft'].nodeValue # The Attr's text, not the Attr
+            except KeyError:
+                pass
+            else:
+                if self.__draftScores.get(draft, 0) < cutoff:
+                    continue
+
+            if wanted:
+                try:
+                    if any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
+                        continue
+                except KeyError: # Some wanted attribute is missing
+                    continue
+
+            yield Node(child)
+
+    __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
+                         contributed = 3, approved = 4, false = 4)
+
+def _parseXPath(selector):
+    # Split "tag[attr=val][...]" into tag-name and attribute mapping.
+    # A bare "[val]" is read as "[type=val]"; the tag-name is not stripped.
+    pieces = selector.split('[')
+    name, tests = pieces[0], [piece.strip() for piece in pieces[1:]]
+    assert all(test.endswith(']') for test in tests)
+    pairs = [test[:-1].split('=') for test in tests]
+    assert all(len(pair) in (1, 2) for pair in pairs)
+    # Each pair is either [value], for type, or [attribute, value]:
+    wanted = dict(('type', pair[0]) if len(pair) == 1 else pair for pair in pairs)
+    return name, wanted
+
+def _iterateEach(iters):
+    # Yield, in order, every item of each iterable that iters supplies.
+    for inner in iters:
+        for each in inner:
+            yield each
+
+class XmlScanner (object):
+    """Wrap an XML file to enable XPath access to its nodes.
+    """
+    def __init__(self, node):
+        self.root = node # Needs a findAllChildren(tag, attrs) method, as Node has
+
+    def findNodes(self, xpath):
+        """Tuple of all nodes under self.root matching this /-separated xpath"""
+        found = (self.root,)
+        for step in xpath.split('/'):
+            if not found: # Nothing matched the last step, so nothing can match
+                break
+            tag, wanted = _parseXPath(step)
+            found = tuple(kid for elt in found for kid in elt.findAllChildren(tag, wanted))
+        return found
+
+class Supplement (XmlScanner):
+    # Replaces xpathlite.findTagsInFile()
+    def find(self, xpath):
+        """Yield (name, attribute dict) for each attribute-bearing child of nodes matching xpath."""
+        for node in self.findNodes(xpath):
+            # A match with no children is itself treated as the only candidate:
+            for elt in node.dom.childNodes or (node.dom,):
+                if elt.attributes:
+                    yield (elt.nodeName, dict((k, v if isinstance(v, basestring) else v.nodeValue)
+                                              for k, v in elt.attributes.items()))