diff options
author | Edward Welbourne <edward.welbourne@qt.io> | 2020-02-27 10:56:36 +0100 |
---|---|---|
committer | Edward Welbourne <eddy@chaos.org.uk> | 2020-04-02 19:43:13 +0100 |
commit | c834dbc6fb8881f543e2a599afbc23ee1277483d (patch) | |
tree | d860c83f0ae1fc5751062f6b94ffa67ab014f261 /util/locale_database/cldr.py | |
parent | 9fab53a51317a1692ceb0069f212339bb0dd8780 (diff) |
Move cldr2qtimezone.py's CLDR-reading to a CldrAccess class
This begins the process of replacing xpathlite.py, adding low-level
DOM-access classes to ldml.py and the CldrAccess class to cldr.py
Moved a format comment from cldr2qtimezone.py's doc-string to the
method of CldrAccess that does the actual reading.
Task-number: QTBUG-81344
Change-Id: I46ae3f402f8207ced6d30a1de5cedaeef47b2bcf
Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- | util/locale_database/cldr.py | 182 |
1 files changed, 182 insertions, 0 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py new file mode 100644 index 0000000000..7890adf307 --- /dev/null +++ b/util/locale_database/cldr.py @@ -0,0 +1,182 @@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Digesting the CLDR's data. + +Provides two class: + CldrAccess -- used by the reader to access the tree of data files + +The former should normally be all you need to access. +See individual classes for further detail. +""" + +from xml.dom import minidom +from weakref import WeakValueDictionary as CacheDict +import os + +from localetools import Error +from ldml import Node, Supplement + +class CldrAccess (object): + def __init__(self, root): + """Set up a master object for accessing CLDR data. + + Single parameter, root, is the file-system path to the root of + the unpacked CLDR archive; its common/ sub-directory should + contain dtd/, main/ and supplemental/ sub-directories.""" + self.root = root + + def supplement(self, name): + """Loads supplemental data as a Supplement object. + + The name should be that of a file in common/supplemental/, without path. + """ + return Supplement(Node(self.__xml(('common', 'supplemental', name)))) + + def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py + """Digest CLDR's MS-Win time-zone name mapping. + + MS-Win have their own eccentric names for time-zones. CLDR + helpfully provides a translation to more orthodox names. + + Singe argument, lookup, is a mapping from known MS-Win names + for locales to a unique integer index (starting at 1). + + The XML structure we read has the form: + + <supplementalData> + <windowsZones> + <mapTimezones otherVersion="..." typeVersion="..."> + <!-- (UTC-08:00) Pacific Time (US & Canada) --> + <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/> + <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/> + <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/> + <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/> + </mapTimezones> + </windowsZones> + </supplementalData> +""" + zones = self.supplement('windowsZones.xml') + enum = self.__enumMap('country') + badZones, unLands, defaults, windows = set(), set(), {}, {} + + for name, attrs in zones.find('windowsZones/mapTimezones'): + if name != 'mapZone': + continue + + wid, code = attrs['other'], attrs['territory'] + data = dict(windowsId = wid, + countryCode = code, + ianaList = attrs['type']) + + try: + key = lookup[wid] + except KeyError: + badZones.add(wid) + key = 0 + data['windowsKey'] = key + + if code == u'001': + defaults[key] = data['ianaList'] + else: + try: + cid, name = enum[code] + except KeyError: + unLands.append(code) + continue + data.update(countryId = cid, country = name) + windows[key, cid] = data + + if unLands: + raise Error('Unknown country codes, please add to enumdata.py: ' + + ', '.join(sorted(unLands))) + + if badZones: + raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: ' + + ', '.join(sorted(badZones))) + + return self.cldrVersion, defaults, windows + + @property + def cldrVersion(self): + # Evaluate so as to ensure __cldrVersion is set: + self.__scanLdmlDtd() + return self.__cldrVersion + + # Implementation details + def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join): + try: + doc = cache[path] + except KeyError: + cache[path] = doc = read(joinPath(self.root, *path)).documentElement + return doc + + def __open(self, path, joinPath=os.path.join): + return open(joinPath(self.root, *path)) + + @property + def __supplementalData(self, cache = []): + if not cache: + cache.append(self.supplement('supplementalData.xml')) + return cache[0] + + def __scanLdmlDtd(self, joinPath = os.path.join): + """Scan the LDML DTD, record CLDR version.""" + with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd: + for line in dtd: + if line.startswith('<!ATTLIST '): + parts = line.split() + if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']: + # parts[5] is the version, in quotes, although the final > might be stuck on its end: + self.__cldrVersion = parts[5].split('"')[1] + break + + def __enumMap(self, key, cache = {}): + if not cache: + cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')} + # They're not actually lists: mappings from numeric value + # to pairs of full name and short code. What we want, in + # each case, is a mapping from code to the other two. + from enumdata import language_list, script_list, country_list + for form, book, empty in (('language', language_list, 'AnyLanguage'), + ('script', script_list, 'AnyScript'), + ('country', country_list, 'AnyCountry')): + cache[form] = dict((pair[1], (num, pair[0])) + for num, pair in book.items() if pair[0] != 'C') + # (Have to filter out the C locale, as we give it the + # same (all space) code as AnyLanguage, whose code + # should probably be 'und' instead.) + + # Map empty to zero and the any value: + cache[form][''] = (0, empty) + # and map language code 'und' also to (0, any): + cache['language']['und'] = (0, 'AnyLanguage') + + return cache[key] + +# Unpolute the namespace: we don't need to export these. +del minidom, CacheDict, os |