From c834dbc6fb8881f543e2a599afbc23ee1277483d Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Thu, 27 Feb 2020 10:56:36 +0100 Subject: Move cldr2qtimezone.py's CLDR-reading to a CldrAccess class This begins the process of replacing xpathlite.py, adding low-level DOM-access classes to ldml.py and the CldrAccess class to cldr.py Moved a format comment from cldr2qtimezone.py's doc-string to the method of CldrAccess that does the actual reading. Task-number: QTBUG-81344 Change-Id: I46ae3f402f8207ced6d30a1de5cedaeef47b2bcf Reviewed-by: Cristian Maureira-Fredes --- util/locale_database/cldr.py | 182 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 util/locale_database/cldr.py (limited to 'util/locale_database/cldr.py') diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py new file mode 100644 index 0000000000..7890adf307 --- /dev/null +++ b/util/locale_database/cldr.py @@ -0,0 +1,182 @@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. 
## Please review the following
## information to ensure the GNU General Public License requirements will
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
##
## $QT_END_LICENSE$
##
#############################################################################
"""Digesting the CLDR's data.

Provides one class:
  CldrAccess -- used by the reader to access the tree of data files

This should normally be all you need to access.
See individual classes for further detail.
"""

from xml.dom import minidom
from weakref import WeakValueDictionary as CacheDict
import os

from localetools import Error
from ldml import Node, Supplement

class CldrAccess (object):
    """Access point for the data files of an unpacked CLDR archive.

    Public API: supplement(), readWindowsTimeZones() and the
    cldrVersion property. The double-underscore members are
    implementation details."""

    def __init__(self, root):
        """Set up a master object for accessing CLDR data.

        Single parameter, root, is the file-system path to the root of
        the unpacked CLDR archive; its common/ sub-directory should
        contain dtd/, main/ and supplemental/ sub-directories."""
        self.root = root

    def supplement(self, name):
        """Loads supplemental data as a Supplement object.

        The name should be that of a file in common/supplemental/, without path.
        """
        return Supplement(Node(self.__xml(('common', 'supplemental', name))))

    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
        """Digest CLDR's MS-Win time-zone name mapping.

        MS-Win have their own eccentric names for time-zones.  CLDR
        helpfully provides a translation to more orthodox names.

        Single argument, lookup, is a mapping from known MS-Win names
        for locales to a unique integer index (starting at 1).

        Returns a triple (cldrVersion, defaults, windows):
          * defaults maps each Windows key to the IANA list given for
            territory '001';
          * windows maps each (Windows key, country id) pair to a dict
            with keys windowsId, countryCode, ianaList, windowsKey,
            countryId and country.

        Raises Error if any territory code is missing from the country
        enumeration or any Windows ID is missing from lookup.

        The XML structure we read has the form:

 <supplementalData>
     <windowsZones>
         <mapTimezones otherVersion="..." typeVersion="...">
             <!-- (UTC-08:00) Pacific Time (US & Canada) -->
             <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
             <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
             <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
             <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
         </mapTimezones>
     </windowsZones>
 </supplementalData>
"""
        # NOTE(review): the XML sample above was stripped from the text
        # under review; it is restored from the element and attribute
        # names read below - confirm against the upstream file.
        zones = self.supplement('windowsZones.xml')
        enum = self.__enumMap('country')
        badZones, unLands, defaults, windows = set(), set(), {}, {}

        for name, attrs in zones.find('windowsZones/mapTimezones'):
            if name != 'mapZone':
                continue

            wid, code = attrs['other'], attrs['territory']
            data = dict(windowsId = wid,
                        countryCode = code,
                        ianaList = attrs['type'])

            try:
                key = lookup[wid]
            except KeyError:
                # Remember the offender, carry on with a placeholder key
                # so that all problems get reported in one go.
                badZones.add(wid)
                key = 0
            data['windowsKey'] = key

            if code == u'001':
                defaults[key] = data['ianaList']
            else:
                try:
                    # Use a name distinct from the loop variable above.
                    cid, land = enum[code]
                except KeyError:
                    # unLands is a set, so .add() (it has no .append()).
                    unLands.add(code)
                    continue
                data.update(countryId = cid, country = land)
                windows[key, cid] = data

        if unLands:
            raise Error('Unknown country codes, please add to enumdata.py: '
                        + ', '.join(sorted(unLands)))

        if badZones:
            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
                        + ', '.join(sorted(badZones)))

        return self.cldrVersion, defaults, windows

    @property
    def cldrVersion(self):
        """The CLDR version string recorded in common/dtd/ldml.dtd."""
        # Evaluate so as to ensure __cldrVersion is set:
        self.__scanLdmlDtd()
        return self.__cldrVersion

    # Implementation details
    # (Default arguments capture minidom.parse, os.path.join and the
    # cache class at definition time, since the module deletes those
    # names from its namespace at the end.)
    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
        """Return the document element of the XML file at path.

        The path is a tuple of components relative to self.root.
        Parsed documents are remembered in a weak-value cache."""
        # NOTE(review): the cache is shared by all instances and keyed
        # on path alone, so instances with different roots would see
        # each other's documents - confirm only one root is ever used.
        try:
            doc = cache[path]
        except KeyError:
            cache[path] = doc = read(joinPath(self.root, *path)).documentElement
        return doc

    def __open(self, path, joinPath=os.path.join):
        """Open the file at path (a tuple of components under self.root)."""
        return open(joinPath(self.root, *path))

    @property
    def __supplementalData(self, cache = []):
        """The Supplement object for supplementalData.xml, loaded once."""
        # NOTE(review): as for __xml, this cache is shared between
        # instances regardless of their root.
        if not cache:
            cache.append(self.supplement('supplementalData.xml'))
        return cache[0]

    def __scanLdmlDtd(self, joinPath = os.path.join):
        """Scan the LDML DTD, record CLDR version.

        Sets self.__cldrVersion; raises Error if the DTD has no
        version declaration."""
        # NOTE(review): the text between '<' and '>' on the startswith
        # line was lost from the text under review.  The test below is
        # reconstructed from the surviving parts[5] handling and the
        # DTD's version declaration,
        #   <!ATTLIST version cldrVersion CDATA #FIXED "NN" >
        # - confirm against the upstream file.
        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
            for line in dtd:
                if line.startswith('<!ATTLIST '):
                    parts = line.split()
                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
                        self.__cldrVersion = parts[5].split('"')[1]
                        break
            else:
                # Fail clearly here, rather than with an AttributeError
                # when cldrVersion reads the attribute.
                raise Error('Failed to find the CLDR version in common/dtd/ldml.dtd')

    def __enumMap(self, key, cache = {}):
        """Return the mapping from code to (numeric id, name) for key.

        The key is one of 'language', 'script', 'country' or
        'variant'.  The tables are built from enumdata on first use and
        shared thereafter."""
        if not cache:
            # They're not actually lists: mappings from numeric value
            # to pairs of full name and short code. What we want, in
            # each case, is a mapping from code to the other two.
            from enumdata import language_list, script_list, country_list
            # Build locally and publish in one step, so that a failure
            # part-way cannot leave a half-filled cache that later
            # calls would take to be complete.
            maps = {'variant': {'': (0, 'This should never be seen outside ldml.py')}}
            for form, book, empty in (('language', language_list, 'AnyLanguage'),
                                      ('script', script_list, 'AnyScript'),
                                      ('country', country_list, 'AnyCountry')):
                maps[form] = dict((pair[1], (num, pair[0]))
                                  for num, pair in book.items() if pair[0] != 'C')
                # (Have to filter out the C locale, as we give it the
                # same (all space) code as AnyLanguage, whose code
                # should probably be 'und' instead.)

                # Map empty to zero and the any value:
                maps[form][''] = (0, empty)
            # and map language code 'und' also to (0, any):
            maps['language']['und'] = (0, 'AnyLanguage')
            cache.update(maps)

        return cache[key]

# Unpollute the namespace: we don't need to export these.
del minidom, CacheDict, os
# -- cgit v1.2.3