From c834dbc6fb8881f543e2a599afbc23ee1277483d Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Thu, 27 Feb 2020 10:56:36 +0100 Subject: Move cldr2qtimezone.py's CLDR-reading to a CldrAccess class This begins the process of replacing xpathlite.py, adding low-level DOM-access classes to ldml.py and the CldrAccess class to cldr.py Moved a format comment from cldr2qtimezone.py's doc-string to the method of CldrAccess that does the actual reading. Task-number: QTBUG-81344 Change-Id: I46ae3f402f8207ced6d30a1de5cedaeef47b2bcf Reviewed-by: Cristian Maureira-Fredes --- util/locale_database/cldr.py | 182 +++++++++++++++++++++++++++++++++ util/locale_database/cldr2qtimezone.py | 88 +++------------- util/locale_database/ldml.py | 140 +++++++++++++++++++++++++ 3 files changed, 339 insertions(+), 71 deletions(-) create mode 100644 util/locale_database/cldr.py create mode 100644 util/locale_database/ldml.py (limited to 'util') diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py new file mode 100644 index 0000000000..7890adf307 --- /dev/null +++ b/util/locale_database/cldr.py @@ -0,0 +1,182 @@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. 
class CldrAccess (object):
    """Access point for the data files of an unpacked CLDR archive.

    Constructed from the root directory of the archive; files are
    found relative to that root (e.g. common/supplemental/ and
    common/dtd/). Public API: supplement(), readWindowsTimeZones()
    and the cldrVersion property."""

    def __init__(self, root):
        """Set up a master object for accessing CLDR data.

        Single parameter, root, is the file-system path to the root of
        the unpacked CLDR archive; its common/ sub-directory should
        contain dtd/, main/ and supplemental/ sub-directories."""
        self.root = root

    def supplement(self, name):
        """Loads supplemental data as a Supplement object.

        The name should be that of a file in common/supplemental/, without path.
        """
        return Supplement(Node(self.__xml(('common', 'supplemental', name))))

    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
        """Digest CLDR's MS-Win time-zone name mapping.

        MS-Win have their own eccentric names for time-zones. CLDR
        helpfully provides a translation to more orthodox names.

        Single argument, lookup, is a mapping from known MS-Win names
        for locales to a unique integer index (starting at 1).

        Returns a triple (cldrVersion, defaults, windows):
          defaults maps each MS-Win key to the IANA ID list given for
            territory '001';
          windows maps (MS-Win key, country ID) to a dict with keys
            windowsId, countryCode, ianaList, windowsKey, countryId
            and country.

        Raises Error if any territory code is unknown to enumdata.py,
        or if any MS-Win ID is missing from lookup; all offenders are
        collected first, so a single run reports every one of them.

        The XML structure we read has the form (below the root
        element of windowsZones.xml):

            <windowsZones>
                <mapTimezones>
                    <mapZone other="UTC+12" territory="001" type="..."/>
                    <mapZone other="UTC+12" territory="MH"
                             type="Pacific/Majuro Pacific/Kwajalein"/>
                </mapTimezones>
            </windowsZones>
        """
        zones = self.supplement('windowsZones.xml')
        enum = self.__enumMap('country')
        badZones, unLands, defaults, windows = set(), set(), {}, {}

        for name, attrs in zones.find('windowsZones/mapTimezones'):
            if name != 'mapZone':
                continue

            wid, code = attrs['other'], attrs['territory']
            data = dict(windowsId = wid,
                        countryCode = code,
                        ianaList = attrs['type'])

            try:
                key = lookup[wid]
            except KeyError:
                # Remember it, so that it can be reported below; use
                # key 0 (real keys start at 1) to keep scanning.
                badZones.add(wid)
                key = 0
            data['windowsKey'] = key

            if code == u'001':
                defaults[key] = data['ianaList']
            else:
                try:
                    cid, land = enum[code]
                except KeyError:
                    # unLands is a set, so .add(), not .append():
                    unLands.add(code)
                    continue
                data.update(countryId = cid, country = land)
                windows[key, cid] = data

        if unLands:
            raise Error('Unknown country codes, please add to enumdata.py: '
                        + ', '.join(sorted(unLands)))

        if badZones:
            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
                        + ', '.join(sorted(badZones)))

        return self.cldrVersion, defaults, windows

    @property
    def cldrVersion(self):
        """The CLDR version, as a string, read from common/dtd/ldml.dtd"""
        # Evaluate so as to ensure __cldrVersion is set:
        self.__scanLdmlDtd()
        return self.__cldrVersion

    # Implementation details
    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
        """Root element of the XML file at the given path under self.root.

        The path is a tuple of path components. Parsed documents are
        kept in a weak-value cache that is shared by all instances
        (it's a default argument), so entries are keyed on the full
        file name, to keep apart instances with different roots."""
        fileName = joinPath(self.root, *path)
        try:
            doc = cache[fileName]
        except KeyError:
            cache[fileName] = doc = read(fileName).documentElement
        return doc

    def __open(self, path, joinPath=os.path.join):
        """Open, for reading, the file at the given path under self.root."""
        return open(joinPath(self.root, *path))

    @property
    def __supplementalData(self, cache = []):
        # NOTE: the default-argument list is shared by all instances,
        # so this is loaded at most once per process, whatever the root.
        if not cache:
            cache.append(self.supplement('supplementalData.xml'))
        return cache[0]

    def __scanLdmlDtd(self, joinPath = os.path.join):
        """Scan the LDML DTD, record CLDR version."""
        # NOTE(review): the '<!ATTLIST ' prefix and the parts[1:5]
        # comparison were restored from context - confirm against the
        # upstream cldr.py.
        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
            for line in dtd:
                if line.startswith('<!ATTLIST '):
                    parts = line.split()
                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
                        # parts[5] is the version, in quotes, although the
                        # final > might be stuck on its end:
                        self.__cldrVersion = parts[5].split('"')[1]
                        break

    def __enumMap(self, key, cache = {}):
        """Mapping from code to (numeric ID, name) for one kind of enum.

        The key is one of 'language', 'script', 'country' or
        'variant'. The mappings are built from enumdata on first use
        and shared thereafter (the cache is a default argument)."""
        if not cache:
            cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
            # They're not actually lists: mappings from numeric value
            # to pairs of full name and short code. What we want, in
            # each case, is a mapping from code to the other two.
            from enumdata import language_list, script_list, country_list
            for form, book, empty in (('language', language_list, 'AnyLanguage'),
                                      ('script', script_list, 'AnyScript'),
                                      ('country', country_list, 'AnyCountry')):
                cache[form] = dict((pair[1], (num, pair[0]))
                                   for num, pair in book.items() if pair[0] != 'C')
                # (Have to filter out the C locale, as we give it the
                # same (all space) code as AnyLanguage, whose code
                # should probably be 'und' instead.)

                # Map empty to zero and the any value:
                cache[form][''] = (0, empty)
            # and map language code 'und' also to (0, any):
            cache['language']['und'] = (0, 'AnyLanguage')

        return cache[key]
- -The XML structure we read has the form: - - - - - - - - - - - - - - """ import os import re import datetime +import textwrap -import enumdata from localetools import unicode2hex, wrap_list, Error, SourceFileEditor -from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, \ - _findEntryInFile as findEntryInFile +from cldr import CldrAccess ### Data that may need updates in response to new entries in the CLDR file ### @@ -351,10 +334,10 @@ def main(args, out, err): """Parses CLDR's data and updates Qt's representation of it. Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as - arguments. Expects two command-line options: the common/ - subdirectory of the unpacked CLDR data-file tree and the root of - the qtbase module's checkout. Updates QTimeZone's private data - about Windows time-zone IDs.""" + arguments. Expects two command-line options: the root of the + unpacked CLDR data-file tree and the root of the qtbase module's + checkout. Updates QTimeZone's private data about Windows time-zone + IDs.""" name = args.pop(0) if len(args) != 2: usage(err, name, "Expected two arguments") @@ -375,54 +358,17 @@ def main(args, out, err): usage(err, name, 'No such file: ' + dataFilePath) return 1 - windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml" - if not os.path.isfile(windowsZonesPath): - usage(err, name, 'Failed to find CLDR data file: ' + windowsZonesPath) - return 1 - - cldrVersion = 'unknown' - ldml = open(cldrPath + "/dtd/ldml.dtd", "r") - for line in ldml: - if 'version cldrVersion CDATA #FIXED' in line: - cldrVersion = line.split('"')[1] - - mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones") - if not mapTimezones: - err.write('Failed to find time-zone data - aborting !\n') + try: + version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones( + dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1))) + except IOError as e: + usage(err, name, + 'Failed to open 
common/supplemental/windowsZones.xml: ' + (e.message or e.args[1])) return 1 - - defaultDict, windowsIdDict = {}, {} - badZones = set() - winIdToIndex = dict((name, ind + 1) for ind, name in enumerate(x[0] for x in windowsIdList)) - for mapZone in mapTimezones: - # [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]] - if mapZone[0] == u'mapZone': - data = {} - for attribute in mapZone[1]: - if attribute[0] == u'other': - data['windowsId'] = attribute[1] - if attribute[0] == u'territory': - data['countryCode'] = attribute[1] - if attribute[0] == u'type': - data['ianaList'] = attribute[1] - - try: - data['windowsKey'] = winIdToIndex[data['windowsId']] - except KeyError: - badZones.add(data['windowsId']) - - countryId = 0 - if data['countryCode'] == u'001': - defaultDict[data['windowsKey']] = data['ianaList'] - else: - data['countryId'] = enumdata.countryCodeToId(data['countryCode']) - if data['countryId'] < 0: - raise Error('Unknown Country Code "{}"'.format(data['countryCode'])) - data['country'] = enumdata.country_list[data['countryId']][0] - windowsIdDict[data['windowsKey'], data['countryId']] = data - if badZones: - err.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones)) - + "\nto the windowsIdList in cldr2qtimezone.py\n\n") + except Error as e: + err.write('\n'.join(textwrap.wrap( + 'Failed to read windowsZones.xml: ' + (e.message or e.args[1]), + subsequent_indent=' ', width=80)) + '\n') return 1 out.write('Input file parsed, now writing data\n') @@ -433,7 +379,7 @@ def main(args, out, err): return 1 try: - writer.write(cldrVersion, defaultDict, windowsIdDict) + writer.write(version, defaults, winIds) except Error as e: writer.cleanup() err.write('\nError in Windows ID data: ' + e.message + '\n') diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py new file mode 100644 index 0000000000..4aaa728a86 --- /dev/null +++ b/util/locale_database/ldml.py @@ -0,0 +1,140 
@@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Parsing the Locale Data Markup Language + +It's an XML format, so the raw parsing of XML is, of course, delegated +to xml.dom.minidom; but it has its own specific schemata and some +funky rules for combining data from various files (inheritance between +locales). The use of it we're interested in is extraction of CLDR's +data, so some of the material here is specific to CLDR; see cldr.py +for how it is mainly used. 
class Node (object):
    """Wrapper for an arbitrary DOM node.

    Provides various ways to select children of a node. Selected child
    nodes are returned wrapped as Node objects. A Node exposes the
    raw DOM node it wraps via its .dom attribute."""

    def __init__(self, elt):
        """Wraps a DOM node for ease of access.

        Single argument, elt, is the DOM node to wrap."""
        self.dom = elt

    def findAllChildren(self, tag, wanted = None):
        """All children that do have the given tag and attributes.

        First argument is the tag: children with any other tag are
        ignored.

        Optional second argument, wanted, should either be None or map
        attribute names to the values they must have. Only child nodes
        with these attributes set to the given values are yielded.

        Children with a draft attribute are only yielded if its value
        scores at least as well as approved; unrecognized draft values
        score zero, so are skipped."""

        cutoff = 4 # Only accept approved, for now
        for child in self.dom.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName != tag:
                continue

            try:
                # The score table is keyed by the attribute's text,
                # not by the DOM attribute node itself:
                draft = child.attributes['draft'].nodeValue
            except KeyError:
                pass
            else:
                if self.__draftScores.get(draft, 0) < cutoff:
                    continue

            if wanted:
                try:
                    if any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
                        continue
                except KeyError: # Some wanted attribute is missing
                    continue

            yield Node(child)

    __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
                         contributed = 3, approved = 4, false = 4)

def _parseXPath(selector):
    """Split "tag[attr=val][...]" into tag-name and attribute mapping

    A bracketed part with no '=' in it is taken as the value of the
    'type' attribute. Returns a pair: the tag name and a dict mapping
    attribute names to required values (empty if none were given)."""
    attrs = selector.split('[')
    name = attrs.pop(0)
    if attrs:
        attrs = [x.strip() for x in attrs]
        assert all(x.endswith(']') for x in attrs)
        attrs = [x[:-1].split('=') for x in attrs]
        assert all(len(x) in (1, 2) for x in attrs)
        attrs = (('type', x[0]) if len(x) == 1 else x for x in attrs)
    return name, dict(attrs)

def _iterateEach(iters):
    """Flatten a two-layer iterator."""
    for it in iters:
        for item in it:
            yield item

class XmlScanner (object):
    """Wrap an XML file to enable XPath access to its nodes.

    Constructed from the Node wrapping the file's root element."""
    def __init__(self, node):
        self.root = node

    def findNodes(self, xpath):
        """Return all nodes under self.root matching this xpath

        The xpath is a '/'-joined sequence of selectors, each of the
        form understood by _parseXPath(); the first is matched against
        children of self.root. Returns a tuple of Node objects, empty
        if nothing matches."""
        elts = (self.root,)
        for selector in xpath.split('/'):
            tag, attrs = _parseXPath(selector)
            elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts))
            if not elts:
                break
        return elts

class Supplement (XmlScanner):
    """Scanner for a supplemental data file."""
    # Replaces xpathlite.findTagsInFile()
    def find(self, xpath):
        """Yield (tag, attributes) for what lies below each xpath match.

        For each node matching xpath, its children are scanned, or the
        node itself if it has no children. Each of these that has
        attributes yields a pair of its tag name and a dict mapping
        its attribute names to their values, as text."""
        elts = self.findNodes(xpath)
        for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
                                for e in elts):
            if elt.attributes:
                # Take the text of any value that is an attribute node,
                # use any other value as it stands (no need for the
                # Python 2-only basestring to tell them apart):
                yield (elt.nodeName,
                       dict((k, getattr(v, 'nodeValue', v))
                            for k, v in elt.attributes.items()))