summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--util/locale_database/cldr.py182
-rwxr-xr-xutil/locale_database/cldr2qtimezone.py88
-rw-r--r--util/locale_database/ldml.py140
3 files changed, 339 insertions, 71 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
new file mode 100644
index 0000000000..7890adf307
--- /dev/null
+++ b/util/locale_database/cldr.py
@@ -0,0 +1,182 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Digesting the CLDR's data.
+
+Provides one class:
+ CldrAccess -- used by the reader to access the tree of data files
+
+It should normally be all you need to access.
+See individual classes for further detail.
+"""
+
+from xml.dom import minidom
+from weakref import WeakValueDictionary as CacheDict
+import os
+
+from localetools import Error
+from ldml import Node, Supplement
+
+class CldrAccess (object):
+    """Entry point for reading files out of an unpacked CLDR archive.
+
+    Public API: supplement(), readWindowsTimeZones() and the
+    cldrVersion property. The double-underscore members are
+    implementation details.
+
+    The caches used by the private helpers are default-argument
+    objects, hence shared by all instances of this class.
+    NOTE(review): __xml()'s cache is keyed on the path relative to
+    root, not on root itself, so two instances with different roots
+    would see each other's cached documents -- confirm only one root
+    is ever used per process."""
+
+    def __init__(self, root):
+        """Set up a master object for accessing CLDR data.
+
+        Single parameter, root, is the file-system path to the root of
+        the unpacked CLDR archive; its common/ sub-directory should
+        contain dtd/, main/ and supplemental/ sub-directories."""
+        self.root = root
+
+    def supplement(self, name):
+        """Loads supplemental data as a Supplement object.
+
+        The name should be that of a file in common/supplemental/, without path.
+        """
+        return Supplement(Node(self.__xml(('common', 'supplemental', name))))
+
+    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
+        """Digest CLDR's MS-Win time-zone name mapping.
+
+        MS-Win have their own eccentric names for time-zones. CLDR
+        helpfully provides a translation to more orthodox names.
+
+        Single argument, lookup, is a mapping from known MS-Win names
+        for time-zones to a unique integer index (starting at 1).
+
+        Returns a triple (version, defaults, windows):
+          version -- the CLDR version, as given by self.cldrVersion
+          defaults -- maps each MS-Win zone's index to the IANA ID list
+                      of its territory="001" entry
+          windows -- maps (zone index, country ID) to a dict with keys
+                     windowsId, countryCode, ianaList, windowsKey,
+                     countryId and country
+
+        Raises Error if the data mentions a territory code unknown to
+        enumdata.py or an MS-Win ID missing from lookup; all offenders
+        are collected first, so that one run reports every one of them.
+
+        The XML structure we read has the form:
+
+         <supplementalData>
+             <windowsZones>
+                 <mapTimezones otherVersion="..." typeVersion="...">
+                     <!-- (UTC-08:00) Pacific Time (US & Canada) -->
+                     <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
+                     <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
+                     <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
+                     <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
+                 </mapTimezones>
+             </windowsZones>
+         </supplementalData>
+        """
+        zones = self.supplement('windowsZones.xml')
+        enum = self.__enumMap('country')
+        badZones, unLands, defaults, windows = set(), set(), {}, {}
+
+        for tag, attrs in zones.find('windowsZones/mapTimezones'):
+            if tag != 'mapZone':
+                continue
+
+            wid, code = attrs['other'], attrs['territory']
+            data = dict(windowsId = wid,
+                        countryCode = code,
+                        ianaList = attrs['type'])
+
+            try:
+                key = lookup[wid]
+            except KeyError:
+                # Remember it for the error report, but keep scanning
+                # with a dummy key so as to find any further problems.
+                badZones.add(wid)
+                key = 0
+            data['windowsKey'] = key
+
+            if code == u'001':
+                # Territory 001 (the world) gives the zone's default:
+                defaults[key] = data['ianaList']
+            else:
+                try:
+                    cid, land = enum[code]
+                except KeyError:
+                    # unLands is a set, so .add(), not .append():
+                    unLands.add(code)
+                    continue
+                data.update(countryId = cid, country = land)
+                windows[key, cid] = data
+
+        if unLands:
+            raise Error('Unknown country codes, please add to enumdata.py: '
+                        + ', '.join(sorted(unLands)))
+
+        if badZones:
+            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
+                        + ', '.join(sorted(badZones)))
+
+        return self.cldrVersion, defaults, windows
+
+    @property
+    def cldrVersion(self):
+        """The CLDR version string, as declared in common/dtd/ldml.dtd.
+
+        Raises Error if the DTD does not declare a version."""
+        # Evaluate so as to ensure __cldrVersion is set:
+        self.__scanLdmlDtd()
+        return self.__cldrVersion
+
+    # Implementation details
+    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
+        # Returns the root element of the XML file at path, a tuple of
+        # path components relative to self.root. The default-argument
+        # cache holds its values weakly and is keyed on path alone.
+        try:
+            doc = cache[path]
+        except KeyError:
+            cache[path] = doc = read(joinPath(self.root, *path)).documentElement
+        return doc
+
+    def __open(self, path, joinPath=os.path.join):
+        # Opens, for reading, the file at path (a tuple of components
+        # relative to self.root); the caller is responsible for closing it.
+        return open(joinPath(self.root, *path))
+
+    @property
+    def __supplementalData(self, cache = []):
+        # Supplement object for supplementalData.xml, loaded on first
+        # use and then kept in the default-argument list.
+        if not cache:
+            cache.append(self.supplement('supplementalData.xml'))
+        return cache[0]
+
+    def __scanLdmlDtd(self, joinPath = os.path.join):
+        """Scan the LDML DTD, record CLDR version.
+
+        Raises Error if no version declaration is found, rather than
+        leaving __cldrVersion unset for the caller to trip over."""
+        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
+            for line in dtd:
+                if line.startswith('<!ATTLIST '):
+                    parts = line.split()
+                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
+                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+                        self.__cldrVersion = parts[5].split('"')[1]
+                        break
+            else: # Loop ran to completion without finding the version.
+                raise Error('Failed to find cldrVersion in common/dtd/ldml.dtd')
+
+    def __enumMap(self, key, cache = {}):
+        # Returns the mapping, for key in ('language', 'script',
+        # 'country', 'variant'), from code to (numeric ID, name). All
+        # four mappings are built on the first call and kept in the
+        # default-argument cache.
+        if not cache:
+            cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
+            # They're not actually lists: mappings from numeric value
+            # to pairs of full name and short code. What we want, in
+            # each case, is a mapping from code to the other two.
+            from enumdata import language_list, script_list, country_list
+            for form, book, empty in (('language', language_list, 'AnyLanguage'),
+                                      ('script', script_list, 'AnyScript'),
+                                      ('country', country_list, 'AnyCountry')):
+                cache[form] = dict((pair[1], (num, pair[0]))
+                                   for num, pair in book.items() if pair[0] != 'C')
+                # (Have to filter out the C locale, as we give it the
+                # same (all space) code as AnyLanguage, whose code
+                # should probably be 'und' instead.)
+
+                # Map empty to zero and the any value:
+                cache[form][''] = (0, empty)
+            # and map language code 'und' also to (0, any):
+            cache['language']['und'] = (0, 'AnyLanguage')
+
+        return cache[key]
+
+# Unpollute the namespace: we don't need to export these.
+del minidom, CacheDict, os
diff --git a/util/locale_database/cldr2qtimezone.py b/util/locale_database/cldr2qtimezone.py
index f2d2003d53..70b5d1e69e 100755
--- a/util/locale_database/cldr2qtimezone.py
+++ b/util/locale_database/cldr2qtimezone.py
@@ -34,32 +34,15 @@ the CLDR data. Pass its common/ directory as first parameter to this
script and the qtbase root directory as second parameter. It shall
update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for
use.
-
-The XML structure we read has the form:
-
- <supplementalData>
- <version number="$Revision:...$"/>
- <generation date="$Date:...$"/>
- <windowsZones>
- <mapTimezones otherVersion="..." typeVersion="...">
- <!-- (UTC-08:00) Pacific Time (US & Canada) -->
- <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
- <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
- <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
- <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
- </mapTimezones>
- </windowsZones>
- </supplementalData>
"""
import os
import re
import datetime
+import textwrap
-import enumdata
from localetools import unicode2hex, wrap_list, Error, SourceFileEditor
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, \
- _findEntryInFile as findEntryInFile
+from cldr import CldrAccess
### Data that may need updates in response to new entries in the CLDR file ###
@@ -351,10 +334,10 @@ def main(args, out, err):
"""Parses CLDR's data and updates Qt's representation of it.
Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
- arguments. Expects two command-line options: the common/
- subdirectory of the unpacked CLDR data-file tree and the root of
- the qtbase module's checkout. Updates QTimeZone's private data
- about Windows time-zone IDs."""
+ arguments. Expects two command-line options: the root of the
+ unpacked CLDR data-file tree and the root of the qtbase module's
+ checkout. Updates QTimeZone's private data about Windows time-zone
+ IDs."""
name = args.pop(0)
if len(args) != 2:
usage(err, name, "Expected two arguments")
@@ -375,54 +358,17 @@ def main(args, out, err):
usage(err, name, 'No such file: ' + dataFilePath)
return 1
- windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml"
- if not os.path.isfile(windowsZonesPath):
- usage(err, name, 'Failed to find CLDR data file: ' + windowsZonesPath)
- return 1
-
- cldrVersion = 'unknown'
- ldml = open(cldrPath + "/dtd/ldml.dtd", "r")
- for line in ldml:
- if 'version cldrVersion CDATA #FIXED' in line:
- cldrVersion = line.split('"')[1]
-
- mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones")
- if not mapTimezones:
- err.write('Failed to find time-zone data - aborting !\n')
+ try:
+ version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones(
+ dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1)))
+ except IOError as e:
+ usage(err, name,
+ 'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1]))
return 1
-
- defaultDict, windowsIdDict = {}, {}
- badZones = set()
- winIdToIndex = dict((name, ind + 1) for ind, name in enumerate(x[0] for x in windowsIdList))
- for mapZone in mapTimezones:
- # [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]]
- if mapZone[0] == u'mapZone':
- data = {}
- for attribute in mapZone[1]:
- if attribute[0] == u'other':
- data['windowsId'] = attribute[1]
- if attribute[0] == u'territory':
- data['countryCode'] = attribute[1]
- if attribute[0] == u'type':
- data['ianaList'] = attribute[1]
-
- try:
- data['windowsKey'] = winIdToIndex[data['windowsId']]
- except KeyError:
- badZones.add(data['windowsId'])
-
- countryId = 0
- if data['countryCode'] == u'001':
- defaultDict[data['windowsKey']] = data['ianaList']
- else:
- data['countryId'] = enumdata.countryCodeToId(data['countryCode'])
- if data['countryId'] < 0:
- raise Error('Unknown Country Code "{}"'.format(data['countryCode']))
- data['country'] = enumdata.country_list[data['countryId']][0]
- windowsIdDict[data['windowsKey'], data['countryId']] = data
- if badZones:
- err.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones))
- + "\nto the windowsIdList in cldr2qtimezone.py\n\n")
+ except Error as e:
+ err.write('\n'.join(textwrap.wrap(
+ 'Failed to read windowsZones.xml: ' + (e.message or e.args[1]),
+ subsequent_indent=' ', width=80)) + '\n')
return 1
out.write('Input file parsed, now writing data\n')
@@ -433,7 +379,7 @@ def main(args, out, err):
return 1
try:
- writer.write(cldrVersion, defaultDict, windowsIdDict)
+ writer.write(version, defaults, winIds)
except Error as e:
writer.cleanup()
err.write('\nError in Windows ID data: ' + e.message + '\n')
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
new file mode 100644
index 0000000000..4aaa728a86
--- /dev/null
+++ b/util/locale_database/ldml.py
@@ -0,0 +1,140 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Parsing the Locale Data Markup Language
+
+It's an XML format, so the raw parsing of XML is, of course, delegated
+to xml.dom.minidom; but it has its own specific schemata and some
+funky rules for combining data from various files (inheritance between
+locales). The use of it we're interested in is extraction of CLDR's
+data, so some of the material here is specific to CLDR; see cldr.py
+for how it is mainly used.
+
+Provides various classes to wrap xml.dom's objects, specifically those
+returned by minidom.parse() and their child-nodes:
+ Node -- wraps any node in the DOM tree
+ XmlScanner -- wraps the root element of a stand-alone XML file
+ Supplement -- specializes XmlScanner for supplemental data files
+
+See individual classes for further detail.
+"""
+from localetools import Error
+
+class Node (object):
+    """Wrapper for an arbitrary DOM node.
+
+    Provides various ways to select children of a node. Selected child
+    nodes are returned wrapped as Node objects.  A Node exposes the
+    raw DOM node it wraps via its .dom attribute."""
+
+    def __init__(self, elt):
+        """Wraps a DOM node for ease of access.
+
+        Single argument, elt, is the DOM node to wrap."""
+        self.dom = elt
+
+    def findAllChildren(self, tag, wanted = None):
+        """All children that do have the given tag and attributes.
+
+        First argument is the tag: children with any other tag are
+        ignored.
+
+        Optional second argument, wanted, should either be None or map
+        attribute names to the values they must have. Only child nodes
+        with these attributes set to the given values are yielded.
+
+        Children whose draft attribute scores below approved are
+        skipped; a child with no draft attribute is accepted. Yields
+        each selected child wrapped as a Node."""
+
+        cutoff = 4 # Only accept approved, for now
+        for child in self.dom.childNodes:
+            if child.nodeType != child.ELEMENT_NODE:
+                continue
+            if child.nodeName != tag:
+                continue
+
+            try:
+                # Indexing .attributes gives an Attr object; the score
+                # table is keyed on the attribute's text, so take its
+                # nodeValue before looking it up.
+                draft = child.attributes['draft'].nodeValue
+            except KeyError:
+                pass
+            else:
+                # Unrecognised draft values score 0, so are rejected.
+                if self.__draftScores.get(draft, 0) < cutoff:
+                    continue
+
+            if wanted is not None:
+                try:
+                    if wanted and any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
+                        continue
+                except KeyError: # Some wanted attribute is missing
+                    continue
+
+            yield Node(child)
+
+    # Scores for the values of the draft attribute; higher is better.
+    __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
+                         contributed = 3, approved = 4, false = 4)
+
+def _parseXPath(selector):
+    # Takes one step of a path, of form "tag[attr=val][...]", and
+    # returns the tag name paired with a dict of the attribute values
+    # it demands. A bare "[val]" is read as "[type=val]".
+    pieces = selector.split('[')
+    wanted = {}
+    for cond in pieces[1:]:
+        cond = cond.strip()
+        assert cond.endswith(']')
+        pair = cond[:-1].split('=')
+        assert len(pair) in (1, 2)
+        if len(pair) == 1:
+            wanted['type'] = pair[0]
+        else:
+            attr, value = pair
+            wanted[attr] = value
+    return pieces[0], wanted
+
+def _iterateEach(iters):
+    # Lazily chains together the contents of each iterable that iters
+    # delivers, yielding every item in order.
+    for inner in iters:
+        for entry in inner:
+            yield entry
+
+class XmlScanner (object):
+    """Wrap an XML file to enable XPath access to its nodes.
+
+    The node passed to the constructor is kept as self.root; it must
+    support findAllChildren(tag, attrs), as Node does."""
+
+    def __init__(self, node):
+        self.root = node
+
+    def findNodes(self, xpath):
+        """Return all nodes under self.root matching this xpath
+
+        The xpath is a '/'-joined sequence of "tag[attr=val]" steps,
+        each selecting among the children of the nodes matched by the
+        steps before it. Returns a tuple, empty if nothing matches."""
+        found = (self.root,)
+        for step in xpath.split('/'):
+            tag, attrs = _parseXPath(step)
+            found = tuple(kid for parent in found
+                          for kid in parent.findAllChildren(tag, attrs))
+            if not found:
+                # Nothing left to search within.
+                break
+        return found
+
+class Supplement (XmlScanner):
+    # Replaces xpathlite.findTagsInFile()
+    def find(self, xpath):
+        """Iterate (name, attributes) pairs for what xpath selects.
+
+        For each node matching xpath, looks at its children -- or at
+        the node itself, if it has none -- and yields, for each that
+        has attributes, its node name paired with a dict mapping its
+        attribute names to their values."""
+        for match in self.findNodes(xpath):
+            for elt in match.dom.childNodes or (match.dom,):
+                if not elt.attributes:
+                    continue
+                attrs = {}
+                for key, value in elt.attributes.items():
+                    if not isinstance(value, basestring):
+                        value = value.nodeValue
+                    attrs[key] = value
+                yield (elt.nodeName, attrs)