diff options
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- | util/locale_database/cldr.py | 516 |
1 files changed, 305 insertions, 211 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py index f2b6616fce..75d687dd11 100644 --- a/util/locale_database/cldr.py +++ b/util/locale_database/cldr.py @@ -1,31 +1,5 @@ -# -*- coding: utf-8; -*- -############################################################################# -## -## Copyright (C) 2020 The Qt Company Ltd. -## Contact: https://www.qt.io/licensing/ -## -## This file is part of the test suite of the Qt Toolkit. -## -## $QT_BEGIN_LICENSE:GPL-EXCEPT$ -## Commercial License Usage -## Licensees holding valid commercial Qt licenses may use this file in -## accordance with the commercial license agreement provided with the -## Software or, alternatively, in accordance with the terms contained in -## a written agreement between you and The Qt Company. For licensing terms -## and conditions see https://www.qt.io/terms-conditions. For further -## information use the contact form at https://www.qt.io/contact-us. -## -## GNU General Public License Usage -## Alternatively, this file may be used under the terms of the GNU -## General Public License version 3 as published by the Free Software -## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -## included in the packaging of this file. Please review the following -## information to ensure the GNU General Public License requirements will -## be met: https://www.gnu.org/licenses/gpl-3.0.html. -## -## $QT_END_LICENSE$ -## -############################################################################# +# Copyright (C) 2021 The Qt Company Ltd. +# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 """Digesting the CLDR's data. Provides two classes: @@ -36,15 +10,17 @@ The former should normally be all you need to access. See individual classes for further detail. """ +from typing import Iterable, TextIO from xml.dom import minidom from weakref import WeakValueDictionary as CacheDict -import os +from pathlib import Path from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner +from localetools import names_clash from qlocalexml import Locale class CldrReader (object): - def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None): + def __init__(self, root: Path, grumble = lambda msg: None, whitter = lambda msg: None): """Set up a reader object for reading CLDR data. Single parameter, root, is the file-system path to the root of @@ -67,7 +43,7 @@ class CldrReader (object): Yields pairs (have, give) of 4-tuples; if what you have matches the left member, giving the right member is probably sensible. Each 4-tuple's entries are the full names of a - language, a script, a country (strictly territory) and a + language, a script, a territory (usually a country) and a variant (currently ignored).""" skips = [] for got, use in self.root.likelySubTags(): @@ -79,7 +55,7 @@ class CldrReader (object): and e.message.startswith('Unknown ') and ' code ' in e.message): skips.append(use) else: - self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message)) + self.grumble(f'Skipping likelySubtag "{got}" -> "{use}" ({e})\n') continue if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]): continue @@ -87,7 +63,7 @@ class CldrReader (object): give = (give[0], # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags have[1] if give[1] == 'AnyScript' else give[1], - have[2] if give[2] == 'AnyCountry' else give[2], + have[2] if give[2] == 'AnyTerritory' else give[2], give[3]) # AnyVariant similarly ? yield have, give @@ -98,51 +74,121 @@ class CldrReader (object): # more out. pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips) + def zoneData(self): + """Locale-independent timezone data. + + Returns a triple (alias, defaults, winIds) in which: + * alias is a mapping from aliases for IANA zone IDs, that + have the form of IANA IDs, to actual current IANA IDs; in + particular, this maps each CLDR zone ID to its + corresponding IANA ID. + * defaults maps each Windows name for a zone to the IANA ID + to use for it by default (when no territory is specified, + or when no entry in winIds matches the given Windows name + and territory). + * winIds is a mapping {(winId, land): ianaList} from Windows + name and territory code to the space-joined list of IANA + IDs associated with the Windows name in the given + territory. + + and reports on any territories found in CLDR timezone data + that are not mentioned in enumdata.territory_map, on any + Windows IDs given in zonedata.windowsIdList that are no longer + covered by the CLDR data.""" + alias, ignored = self.root.bcp47Aliases() + defaults, winIds = self.root.readWindowsTimeZones(alias) + + from zonedata import windowsIdList + winUnused = set(n for n, o in windowsIdList).difference( + set(defaults).union(w for w, t, ids in winIds)) + if winUnused: + joined = "\n\t".join(winUnused) + self.whitter.write( + f'No Windows ID in\n\t{joined}\nis still in use.\n' + 'They could be removed at the next major version.\n') + + # Check for duplicate entries in winIds: + last = ('', '', '') + winDup = {} + for triple in sorted(winIds): + if triple[:2] == last[:2]: + try: + seq = winDup[triple[:2]] + except KeyError: + seq = winDup[triple[:2]] = [] + seq.append(triple[-1]) + if winDup: + joined = '\n\t'.join(f'{t}, {w}: ", ".join(ids)' + for (w, t), ids in winDup.items()) + self.whitter.write( + f'Duplicated (territory, Windows ID) entries:\n\t{joined}\n') + winIds = [trip for trip in winIds if trip[:2] not in winDup] + for (w, t), seq in winDup.items(): + ianalist = [] + for ids in seq: + for iana in ids.split(): + if iana not in ianaList: + ianaList.append(iana) + winIds.append((w, t, ' '.join(ianaList))) + + from enumdata import territory_map + unLand = set(t for w, t, ids in winIds).difference( + v[1] for k, v in territory_map.items()) + if unLand: + self.grumble.write( + 'Unknown territory codes in timezone data: ' + f'{", ".join(unLand)}\n' + 'Skipping Windows zone mappings for these territories\n') + winIds = [(w, t, ids) for w, t, ids in winIds if t not in unLand] + + # Convert list of triples to mapping: + winIds = {(w, t): ids for w, t, ids in winIds} + return alias, defaults, winIds + def readLocales(self, calendars = ('gregorian',)): - locales = tuple(self.__allLocales(calendars)) - return dict(((k.language_id, k.script_id, k.country_id, k.variant_code), - k) for k in locales) + return {(k.language_id, k.script_id, k.territory_id, k.variant_id): k + for k in self.__allLocales(calendars)} def __allLocales(self, calendars): def skip(locale, reason): - return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason) + return f'Skipping defaultContent locale "{locale}" ({reason})\n' for locale in self.root.defaultContentLocales: try: - language, script, country, variant = self.__splitLocale(locale) + language, script, territory, variant = self.__splitLocale(locale) except ValueError: self.whitter(skip(locale, 'only language tag')) continue - if not (script or country): + if not (script or territory): self.grumble(skip(locale, 'second tag is neither script nor territory')) continue - if not (language and country): + if not (language and territory): continue try: yield self.__getLocaleData(self.root.locale(locale), calendars, - language, script, country, variant) + language, script, territory, variant) except Error as e: self.grumble(skip(locale, e.message)) for locale in self.root.fileLocales: try: chain = self.root.locale(locale) - language, script, country, variant = chain.tagCodes() + language, script, territory, variant = chain.tagCodes() assert language # TODO: this skip should probably be based on likely - # sub-tags, instead of empty country: if locale has a + # sub-tags, instead of empty territory: if locale has a # likely-subtag expansion, that's what QLocale uses, # and we'll be saving its data for the expanded locale # anyway, so don't need to record it for itself. # See also QLocaleXmlReader.loadLocaleMap's grumble. - if not country: + if not territory: continue - yield self.__getLocaleData(chain, calendars, language, script, country, variant) + yield self.__getLocaleData(chain, calendars, language, script, territory, variant) except Error as e: - self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message)) + self.grumble(f'Skipping file locale "{locale}" ({e})\n') import textwrap @staticmethod @@ -153,13 +199,13 @@ class CldrReader (object): def __parseTags(self, locale): tags = self.__splitLocale(locale) - language = tags.next() - script = country = variant = '' + language = next(tags) + script = territory = variant = '' try: - script, country, variant = tags + script, territory, variant = tags except ValueError: pass - return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant)) + return tuple(p[1] for p in self.root.codesToIdName(language, script, territory, variant)) def __splitLocale(self, name): """Generate (language, script, territory, variant) from a locale name @@ -171,14 +217,18 @@ class CldrReader (object): single tag (i.e. contains no underscores). Always yields 1 or 4 values, never 2 or 3.""" tags = iter(name.split('_')) - yield tags.next() # Language - tag = tags.next() # may raise StopIteration + yield next(tags) # Language + + try: + tag = next(tags) + except StopIteration: + return # Script is always four letters, always capitalised: if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): yield tag try: - tag = tags.next() + tag = next(tags) except StopIteration: tag = '' else: @@ -188,7 +238,7 @@ class CldrReader (object): if tag and tag.isupper() or tag.isdigit(): yield tag try: - tag = tags.next() + tag = next(tags) except StopIteration: tag = '' else: @@ -201,21 +251,22 @@ class CldrReader (object): else: yield '' - # If nothing is left, StopIteration will avoid the warning: - if not tag: - tag = tags.next() - self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join(tag + tuple(tags)), name)) + rest = [tag] if tag else [] + rest.extend(tags) - def __getLocaleData(self, scan, calendars, language, script, country, variant): - ids, names = zip(*self.root.codesToIdName(language, script, country, variant)) - assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant) + if rest: + self.grumble(f'Ignoring unparsed cruft {"_".join(rest)} in {name}\n') + + def __getLocaleData(self, scan, calendars, language, script, territory, variant): + ids, names = zip(*self.root.codesToIdName(language, script, territory, variant)) + assert ids[0] > 0 and ids[2] > 0, (language, script, territory, variant) locale = Locale( language = names[0], language_code = language, language_id = ids[0], script = names[1], script_code = script, script_id = ids[1], - country = names[2], country_code = country, country_id = ids[2], - variant_code = variant) + territory = names[2], territory_code = territory, territory_id = ids[2], + variant_code = variant, variant_id = ids[3]) - firstDay, weStart, weEnd = self.root.weekData(country) + firstDay, weStart, weEnd = self.root.weekData(territory) assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') for day in (firstDay, weStart, weEnd)) @@ -223,7 +274,7 @@ class CldrReader (object): weekendStart = weStart, weekendEnd = weEnd) - iso, digits, rounding = self.root.currencyData(country) + iso, digits, rounding = self.root.currencyData(territory) locale.update(currencyIsoCode = iso, currencyDigits = int(digits), currencyRounding = int(rounding)) @@ -231,7 +282,7 @@ class CldrReader (object): locale.update(scan.currencyData(iso)) locale.update(scan.numericData(self.root.numberSystem, self.whitter)) locale.update(scan.textPatternData()) - locale.update(scan.endonyms(language, script, country, variant)) + locale.update(scan.endonyms(language, script, territory, variant)) locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ... locale.update(scan.calendarNames(calendars)) # Names of days and months @@ -242,7 +293,7 @@ class CldrReader (object): # the cache. If a process were to instantiate this class with distinct # roots, each cache would be filled by the first to need it ! class CldrAccess (object): - def __init__(self, root): + def __init__(self, root: Path): """Set up a master object for accessing CLDR data. Single parameter, root, is the file-system path to the root of @@ -250,18 +301,18 @@ class CldrAccess (object): contain dtd/, main/ and supplemental/ sub-directories.""" self.root = root - def xml(self, *path): + def xml(self, relative_path: str): """Load a single XML file and return its root element as an XmlScanner. The path is interpreted relative to self.root""" - return XmlScanner(Node(self.__xml(path))) + return XmlScanner(Node(self.__xml(relative_path))) def supplement(self, name): """Loads supplemental data as a Supplement object. The name should be that of a file in common/supplemental/, without path. """ - return Supplement(Node(self.__xml(('common', 'supplemental', name)))) + return Supplement(Node(self.__xml(f'common/supplemental/{name}'))) def locale(self, name): """Loads all data for a locale as a LocaleScanner object. @@ -273,17 +324,18 @@ class CldrAccess (object): inheritance, where relevant.""" return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) + def englishNaming(self, tag): # see QLocaleXmlWriter.enumData() + return self.__codeMap(tag).get + @property - def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir, - splitExtension = os.path.splitext): + def fileLocales(self) -> Iterable[str]: """Generator for locale IDs seen in file-names. All *.xml other than root.xml in common/main/ are assumed to identify locales.""" - for name in listDirectory(joinPath(self.root, 'common', 'main')): - stem, ext = splitExtension(name) - if ext == '.xml' and stem != 'root': - yield stem + for path in self.root.joinpath('common/main').glob('*.xml'): + if path.stem != 'root': + yield path.stem @property def defaultContentLocales(self): @@ -304,44 +356,44 @@ class CldrAccess (object): def numberSystem(self, system): """Get a description of a numbering system. - Returns a mapping, with keys u'digits', u'type' and u'id'; the + Returns a mapping, with keys 'digits', 'type' and 'id'; the value for this last is system. Raises KeyError for unknown number system, ldml.Error on failure to load data.""" try: return self.__numberSystems[system] except KeyError: - raise Error('Unsupported number system: {}'.format(system)) + raise Error(f'Unsupported number system: {system}') - def weekData(self, country): + def weekData(self, territory): """Data on the weekly cycle. Returns a triple (W, S, E) of en's short names for week-days; W is the first day of the week, S the start of the week-end - and E the end of the week-end. Where data for a country is + and E the end of the week-end. Where data for a territory is unavailable, the data for CLDR's territory 001 (The World) is used.""" try: - return self.__weekData[country] + return self.__weekData[territory] except KeyError: return self.__weekData['001'] - def currencyData(self, country): - """Returns currency data for the given country code. + def currencyData(self, territory): + """Returns currency data for the given territory code. Return value is a tuple (ISO4217 code, digit count, rounding - mode). If CLDR provides no data for this country, ('', 2, 1) + mode). If CLDR provides no data for this territory, ('', 2, 1) is the default result. """ try: - return self.__currencyData[country] + return self.__currencyData[territory] except KeyError: return '', 2, 1 - def codesToIdName(self, language, script, country, variant = ''): + def codesToIdName(self, language, script, territory, variant = ''): """Maps each code to the appropriate ID and name. Returns a 4-tuple of (ID, name) pairs corresponding to the - language, script, country and variant given. Raises a + language, script, territory and variant given. Raises a suitable error if any of them is unknown, indicating all that are unknown plus suitable names for any that could sensibly be added to enumdata.py to make them known. @@ -353,33 +405,33 @@ class CldrAccess (object): try: return (enum('language')[language], enum('script')[script], - enum('country')[country], + enum('territory')[territory], enum('variant')[variant]) except KeyError: pass - parts, values = [], [language, script, country, variant] - for index, key in enumerate(('language', 'script', 'country', 'variant')): + parts, values = [], [language, script, territory, variant] + for index, key in enumerate(('language', 'script', 'territory', 'variant')): naming, enums = self.__codeMap(key), enum(key) value = values[index] if value not in enums: - text = '{} code {}'.format(key, value) + text = f'{key} code {value}' name = naming.get(value) if name and value != 'POSIX': - text += u' (could add {})'.format(name) + text += f' (could add {name})' parts.append(text) if len(parts) > 1: parts[-1] = 'and ' + parts[-1] - assert parts + else: + assert parts + if parts[0].startswith('variant'): + raise Error(f'No support for {parts[0]}', + language, script, territory, variant) raise Error('Unknown ' + ', '.join(parts), - language, script, country, variant) + language, script, territory, variant) @staticmethod - def __checkEnum(given, proper, scraps, - remap = { u'å': 'a', u'ã': 'a', u'ç': 'c', u'é': 'e', u'í': 'i', u'ü': 'u'}, - prefix = { 'St.': 'Saint', 'U.S.': 'United States' }, - suffixes = ( 'Han', ), - skip = u'\u02bc'): + def __checkEnum(given, proper, scraps): # Each is a { code: full name } mapping for code, name in given.items(): try: right = proper[code] @@ -387,35 +439,23 @@ class CldrAccess (object): # No en.xml name for this code, but supplementalData's # parentLocale may still believe in it: if code not in scraps: - yield name, '[Found no CLDR name for code {}]'.format(code) - continue - if name == right: continue - ok = right.replace('&', 'And') - for k, v in prefix.items(): - if ok.startswith(k + ' '): - ok = v + ok[len(k):] - while '(' in ok: - try: f, t = ok.index('('), ok.index(')') - except ValueError: break - ok = ok[:f].rstrip() + ' ' + ok[t:].lstrip() - if any(name == ok + ' ' + s for s in suffixes): - continue - if ''.join(ch for ch in name.lower() if not ch.isspace()) in ''.join( - remap.get(ch, ch) for ch in ok.lower() if ch.isalpha() and ch not in skip): + yield name, f'[Found no CLDR name for code {code}]' continue - yield name, ok + cleaned = names_clash(right, name) + if cleaned: + yield name, cleaned def checkEnumData(self, grumble): scraps = set() for k in self.__parentLocale.keys(): for f in k.split('_'): scraps.add(f) - from enumdata import language_list, country_list, script_list - language = dict((v, k) for k, v in language_list.values() if not v.isspace()) - country = dict((v, k) for k, v in country_list.values() if v != 'ZZ') - script = dict((v, k) for k, v in script_list.values() if v != 'Zzzz') + from enumdata import language_map, territory_map, script_map + language = {v: k for k, v in language_map.values() if not v.isspace()} + territory = {v: k for k, v in territory_map.values() if v != 'ZZ'} + script = {v: k for k, v in script_map.values() if v != 'Zzzz'} lang = dict(self.__checkEnum(language, self.__codeMap('language'), scraps)) - land = dict(self.__checkEnum(country, self.__codeMap('country'), scraps)) + land = dict(self.__checkEnum(territory, self.__codeMap('territory'), scraps)) text = dict(self.__checkEnum(script, self.__codeMap('script'), scraps)) if lang or land or text: grumble("""\ @@ -424,81 +464,127 @@ enumdata.py (keeping the old name as an alias): """) if lang: grumble('Language:\n\t' - + '\n\t'.join('{} -> {}'.format(k, v) for k, v in lang.items()) + + '\n\t'.join(f'{k} -> {v}' for k, v in lang.items()) + '\n') if land: - grumble('Country:\n\t' - + '\n\t'.join('{} -> {}'.format(k, v) for k, v in land.items()) + grumble('Territory:\n\t' + + '\n\t'.join(f'{k} -> {v}' for k, v in land.items()) + '\n') if text: grumble('Script:\n\t' - + '\n\t'.join('{} -> {}'.format(k, v) for k, v in text.items()) + + '\n\t'.join(f'{k} -> {v}' for k, v in text.items()) + '\n') grumble('\n') - def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py + def bcp47Aliases(self): + """Reads the mapping from CLDR IDs to IANA IDs + + CLDR identifies timezones in various ways but its standard + 'name' for them, here described as a CLDR ID, has the form of + an IANA ID. CLDR IDs are stable across time, where IANA IDs + may be revised over time, for example Asia/Calcutta became + Asia/Kolkata. When a new zone is added to CLDR, it gets the + then-current IANA ID as its CLDR ID; if it is later + superseded, CLDR continues using the old ID, so we need a + mapping from that to current IANA IDs. Helpfully, CLDR + provides information about aliasing among time-zone IDs. + + The file common/bcp47/timezone.xml has keyword/key/type + elements with attributes: + + name -- zone code (ignore) + description -- long name for exemplar location, including + territory + + and some of: + + deprecated -- ignore entry if present (has no alias) + preferred -- only present if deprecated + since -- version at which this entry was added (ignore) + alias -- space-joined sequence of IANA-form IDs; first is CLDR ID + iana -- if present, repeats the alias entry that's the modern IANA ID + + This returns a pair (alias, naming) wherein: alias is a + mapping from IANA-format IDs to actual IANA IDs, that maps + each alias to the contemporary ID used by IANA; and naming is + a mapping from IANA ID to the description it and its aliases + shared in their keyword/key/type entry.""" + # File has the same form as supplements: + root = Supplement(Node(self.__xml('common/bcp47/timezone.xml'))) + + # If we ever need a mapping back to CLDR ID, we can make + # (description, space-joined-list) the naming values. + alias, naming = {}, {} # { alias: iana }, { iana: description } + for item, attrs in root.find('keyword/key/type', exclude=('deprecated',)): + assert 'description' in attrs, item + assert 'alias' in attrs, item + names = attrs['alias'].split() + assert not any(name in alias for name in names), item + # CLDR ID is names[0]; if IANA now uses another name for + # it, this is given as the iana attribute. + ianaid, fullName = attrs.get('iana', names[0]), attrs['description'] + alias.update({name: ianaid for name in names}) + assert not ianaid in naming + naming[ianaid] = fullName + + return alias, naming + + def readWindowsTimeZones(self, alias): """Digest CLDR's MS-Win time-zone name mapping. - MS-Win have their own eccentric names for time-zones. CLDR - helpfully provides a translation to more orthodox names. - - Singe argument, lookup, is a mapping from known MS-Win names - for locales to a unique integer index (starting at 1). - - The XML structure we read has the form: - - <supplementalData> - <windowsZones> - <mapTimezones otherVersion="..." typeVersion="..."> - <!-- (UTC-08:00) Pacific Time (US & Canada) --> - <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/> - <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/> - <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/> - <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/> - </mapTimezones> - </windowsZones> - </supplementalData> -""" + Single argument, alias, should be the first part of the pair + returned by a call to bcp47Aliases(); it shall be used to + transform CLDR IDs into IANA IDs. + + MS-Win have their own eccentric names for time-zones. CLDR + helpfully provides a translation to more orthodox names, + albeit these are CLDR IDs - see bcp47Aliases() - rather than + (up to date) IANA IDs. The windowsZones.xml supplement has + supplementalData/windowsZones/mapTimezones/mapZone nodes with + attributes + + territory -- ISO code + type -- space-joined sequence of CLDR IDs of zones + other -- Windows name of these zones in the given territory + + When 'territory' is '001', type is always just a single CLDR + zone ID. This is the default zone for the given Windows name. + + For each mapZone node, its type is split on spacing and + cleaned up as follows. Those entries that are keys of alias + are mapped thereby to their canonical IANA IDs; all others are + presumed to be canonical IANA IDs and left unchanged. Any + later duplicates of earlier entries are omitted. The result + list of IANA IDs is joined with single spaces between to give + a string s. + + Returns a twople (defaults, windows) in which defaults is a + mapping, from Windows ID to IANA ID (derived from the mapZone + nodes with territory='001'), and windows is a list of triples + (Windows ID, territory code, IANA ID list) in which the first + two entries are the 'other' and 'territory' fields of a + mapZone element and the last is s, its cleaned-up list of IANA + IDs.""" + + defaults, windows = {}, [] zones = self.supplement('windowsZones.xml') - enum = self.__enumMap('country') - badZones, unLands, defaults, windows = set(), set(), {}, {} - for name, attrs in zones.find('windowsZones/mapTimezones'): if name != 'mapZone': continue - wid, code = attrs['other'], attrs['territory'] - data = dict(windowsId = wid, - countryCode = code, - ianaList = attrs['type']) - - try: - key = lookup[wid] - except KeyError: - badZones.add(wid) - key = 0 - data['windowsKey'] = key + wid, code, ianas = attrs['other'], attrs['territory'], [] + for cldr in attrs['type'].split(): + iana = alias.get(cldr, cldr) + if iana not in ianas: + ianas.append(iana) - if code == u'001': - defaults[key] = data['ianaList'] + if code == '001': + assert len(ianas) == 1, (wid, *ianas) + defaults[wid] = ianas[0] else: - try: - cid, name = enum[code] - except KeyError: - unLands.append(code) - continue - data.update(countryId = cid, country = name) - windows[key, cid] = data - - if unLands: - raise Error('Unknown country codes, please add to enumdata.py: ' - + ', '.join(sorted(unLands))) - - if badZones: - raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: ' - + ', '.join(sorted(badZones))) + windows.append((wid, code, ' '.join(ianas))) - return self.cldrVersion, defaults, windows + return defaults, windows @property def cldrVersion(self): @@ -507,20 +593,20 @@ enumdata.py (keeping the old name as an alias): return self.__cldrVersion # Implementation details - def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join): + def __xml(self, relative_path: str, cache = CacheDict(), read = minidom.parse): try: - doc = cache[path] + doc = cache[relative_path] except KeyError: - cache[path] = doc = read(joinPath(self.root, *path)).documentElement + cache[relative_path] = doc = read(str(self.root.joinpath(relative_path))).documentElement return doc - def __open(self, path, joinPath=os.path.join): - return open(joinPath(self.root, *path)) + def __open(self, relative_path: str) -> TextIO: + return self.root.joinpath(relative_path).open() @property def __rootLocale(self, cache = []): if not cache: - cache.append(self.xml('common', 'main', 'root.xml')) + cache.append(self.xml('common/main/root.xml')) return cache[0] @property @@ -530,7 +616,7 @@ enumdata.py (keeping the old name as an alias): return cache[0] @property - def __numberSystems(self, cache = {}, joinPath=os.path.join): + def __numberSystems(self, cache = {}): if not cache: for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'): cache[attrs['id']] = attrs @@ -563,7 +649,7 @@ enumdata.py (keeping the old name as an alias): source = self.__supplementalData for key in ('firstDay', 'weekendStart', 'weekendEnd'): result = {} - for ignore, attrs in source.find('weekData/' + key): + for ignore, attrs in source.find(f'weekData/{key}'): assert ignore == key day = attrs['day'] assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day @@ -579,8 +665,10 @@ enumdata.py (keeping the old name as an alias): source = self.__supplementalData for elt in source.findNodes('currencyData/region'): iso, digits, rounding = '', 2, 1 + # TODO: fractions/info[iso4217=DEFAULT] has rounding=0 - why do we differ ? + # Also: some fractions/info have cashDigits and cashRounding - should we use them ? try: - country = elt.dom.attributes['iso3166'].nodeValue + territory = elt.dom.attributes['iso3166'].nodeValue except KeyError: continue for child in elt.findAllChildren('currency'): @@ -596,16 +684,16 @@ enumdata.py (keeping the old name as an alias): break if iso: for tag, data in source.find( - 'currencyData/fractions/info[iso4217={}]'.format(iso)): + f'currencyData/fractions/info[iso4217={iso}]'): digits = data['digits'] rounding = data['rounding'] - cache[country] = iso, digits, rounding + cache[territory] = iso, digits, rounding assert cache return cache @property - def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join): + def __unDistinguishedAttributes(self, cache = {}): """Mapping from tag names to lists of attributes. LDML defines some attributes as 'distinguishing': if a node @@ -625,7 +713,7 @@ enumdata.py (keeping the old name as an alias): return cache - def __scanLdmlDtd(self, joinPath = os.path.join): + def __scanLdmlDtd(self): """Scan the LDML DTD, record CLDR version Yields (tag, attrs) pairs: on elements with a given tag, @@ -635,7 +723,7 @@ enumdata.py (keeping the old name as an alias): Sets self.__cldrVersion as a side-effect, since this information is found in the same file.""" - with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd: + with self.__open('common/dtd/ldml.dtd') as dtd: tag, ignored, last = None, None, None for line in dtd: @@ -670,15 +758,15 @@ enumdata.py (keeping the old name as an alias): def __enumMap(self, key, cache = {}): if not cache: cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')} - # They're not actually lists: mappings from numeric value - # to pairs of full name and short code. What we want, in - # each case, is a mapping from code to the other two. - from enumdata import language_list, script_list, country_list - for form, book, empty in (('language', language_list, 'AnyLanguage'), - ('script', script_list, 'AnyScript'), - ('country', country_list, 'AnyCountry')): - cache[form] = dict((pair[1], (num, pair[0])) - for num, pair in book.items() if pair[0] != 'C') + # They're mappings from numeric value to pairs of full + # name and short code. What we want, in each case, is a + # mapping from code to the other two. + from enumdata import language_map, script_map, territory_map + for form, book, empty in (('language', language_map, 'AnyLanguage'), + ('script', script_map, 'AnyScript'), + ('territory', territory_map, 'AnyTerritory')): + cache[form] = {pair[1]: (num, pair[0]) + for num, pair in book.items() if pair[0] != 'C'} # (Have to filter out the C locale, as we give it the # same (all space) code as AnyLanguage, whose code # should probably be 'und' instead.) @@ -693,9 +781,9 @@ enumdata.py (keeping the old name as an alias): def __codeMap(self, key, cache = {}, # Maps our name for it to CLDR's name: naming = {'language': 'languages', 'script': 'scripts', - 'country': 'territories', 'variant': 'variants'}): + 'territory': 'territories', 'variant': 'variants'}): if not cache: - root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames') + root = self.xml('common/main/en.xml').root.findUniqueChild('localeDisplayNames') for dst, src in naming.items(): cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src))) assert cache @@ -721,7 +809,13 @@ enumdata.py (keeping the old name as an alias): except (KeyError, ValueError, TypeError): pass else: - if key not in seen or not elt.attributes.has_key('alt'): + # Prefer stand-alone forms of names when present, ignore other + # alt="..." entries. For example, Traditional and Simplified + # Han omit "Han" in the plain form, but include it for + # stand-alone. As the stand-alone version appears later, it + # over-writes the plain one. + if (key not in seen or 'alt' not in elt.attributes + or elt.attributes['alt'].nodeValue == 'stand-alone'): yield key, value seen.add(key) @@ -730,7 +824,8 @@ enumdata.py (keeping the old name as an alias): def __parentLocale(self, cache = {}): # see http://www.unicode.org/reports/tr35/#Parent_Locales if not cache: - for tag, attrs in self.__supplementalData.find('parentLocales'): + for tag, attrs in self.__supplementalData.find('parentLocales', + ('component',)): parent = attrs.get('parent', '') for child in attrs['locales'].split(): cache[child] = parent @@ -738,10 +833,9 @@ enumdata.py (keeping the old name as an alias): return cache - def __localeAsDoc(self, name, aliasFor = None, - joinPath = os.path.join, exists = os.path.isfile): - path = ('common', 'main', name + '.xml') - if exists(joinPath(self.root, *path)): + def __localeAsDoc(self, name: str, aliasFor = None): + path = f'common/main/{name}.xml' + if self.root.joinpath(path).exists(): elt = self.__xml(path) for child in Node(elt).findAllChildren('alias'): try: @@ -754,8 +848,8 @@ enumdata.py (keeping the old name as an alias): return elt if aliasFor: - raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias' - .format(aliasFor, name)) + raise Error(f'Fatal error: found an alias "{aliasFor}" -> "{name}", ' + 'but found no file for the alias') def __scanLocaleRoots(self, name): while name and name != 'root': @@ -780,4 +874,4 @@ enumdata.py (keeping the old name as an alias): return chain # Unpolute the namespace: we don't need to export these. -del minidom, CacheDict, os +del minidom, CacheDict |