summaryrefslogtreecommitdiffstats
path: root/util/locale_database/cldr.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r--util/locale_database/cldr.py516
1 files changed, 305 insertions, 211 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
index f2b6616fce..75d687dd11 100644
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@@ -1,31 +1,5 @@
-# -*- coding: utf-8; -*-
-#############################################################################
-##
-## Copyright (C) 2020 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
+# Copyright (C) 2021 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
"""Digesting the CLDR's data.
Provides two classes:
@@ -36,15 +10,17 @@ The former should normally be all you need to access.
See individual classes for further detail.
"""
+from typing import Iterable, TextIO
from xml.dom import minidom
from weakref import WeakValueDictionary as CacheDict
-import os
+from pathlib import Path
from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from localetools import names_clash
from qlocalexml import Locale
class CldrReader (object):
- def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None):
+ def __init__(self, root: Path, grumble = lambda msg: None, whitter = lambda msg: None):
"""Set up a reader object for reading CLDR data.
Single parameter, root, is the file-system path to the root of
@@ -67,7 +43,7 @@ class CldrReader (object):
Yields pairs (have, give) of 4-tuples; if what you have
matches the left member, giving the right member is probably
sensible. Each 4-tuple's entries are the full names of a
- language, a script, a country (strictly territory) and a
+ language, a script, a territory (usually a country) and a
variant (currently ignored)."""
skips = []
for got, use in self.root.likelySubTags():
@@ -79,7 +55,7 @@ class CldrReader (object):
and e.message.startswith('Unknown ') and ' code ' in e.message):
skips.append(use)
else:
- self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message))
+ self.grumble(f'Skipping likelySubtag "{got}" -> "{use}" ({e})\n')
continue
if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]):
continue
@@ -87,7 +63,7 @@ class CldrReader (object):
give = (give[0],
# Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
have[1] if give[1] == 'AnyScript' else give[1],
- have[2] if give[2] == 'AnyCountry' else give[2],
+ have[2] if give[2] == 'AnyTerritory' else give[2],
give[3]) # AnyVariant similarly ?
yield have, give
@@ -98,51 +74,121 @@ class CldrReader (object):
# more out.
pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
+ def zoneData(self):
+ """Locale-independent timezone data.
+
+ Returns a triple (alias, defaults, winIds) in which:
+ * alias is a mapping from aliases for IANA zone IDs, that
+ have the form of IANA IDs, to actual current IANA IDs; in
+ particular, this maps each CLDR zone ID to its
+ corresponding IANA ID.
+ * defaults maps each Windows name for a zone to the IANA ID
+ to use for it by default (when no territory is specified,
+ or when no entry in winIds matches the given Windows name
+ and territory).
+ * winIds is a mapping {(winId, land): ianaList} from Windows
+ name and territory code to the space-joined list of IANA
+ IDs associated with the Windows name in the given
+ territory.
+
+ and reports on any territories found in CLDR timezone data
+ that are not mentioned in enumdata.territory_map, on any
+ Windows IDs given in zonedata.windowsIdList that are no longer
+ covered by the CLDR data."""
+ alias, ignored = self.root.bcp47Aliases()
+ defaults, winIds = self.root.readWindowsTimeZones(alias)
+
+ from zonedata import windowsIdList
+ winUnused = set(n for n, o in windowsIdList).difference(
+ set(defaults).union(w for w, t, ids in winIds))
+ if winUnused:
+ joined = "\n\t".join(winUnused)
+ self.whitter.write(
+ f'No Windows ID in\n\t{joined}\nis still in use.\n'
+ 'They could be removed at the next major version.\n')
+
+ # Check for duplicate entries in winIds:
+ last = ('', '', '')
+ winDup = {}
+ for triple in sorted(winIds):
+ if triple[:2] == last[:2]:
+ try:
+ seq = winDup[triple[:2]]
+ except KeyError:
+ seq = winDup[triple[:2]] = []
+ seq.append(triple[-1])
+ if winDup:
+ joined = '\n\t'.join(f'{t}, {w}: ", ".join(ids)'
+ for (w, t), ids in winDup.items())
+ self.whitter.write(
+ f'Duplicated (territory, Windows ID) entries:\n\t{joined}\n')
+ winIds = [trip for trip in winIds if trip[:2] not in winDup]
+ for (w, t), seq in winDup.items():
+ ianalist = []
+ for ids in seq:
+ for iana in ids.split():
+ if iana not in ianaList:
+ ianaList.append(iana)
+ winIds.append((w, t, ' '.join(ianaList)))
+
+ from enumdata import territory_map
+ unLand = set(t for w, t, ids in winIds).difference(
+ v[1] for k, v in territory_map.items())
+ if unLand:
+ self.grumble.write(
+ 'Unknown territory codes in timezone data: '
+ f'{", ".join(unLand)}\n'
+ 'Skipping Windows zone mappings for these territories\n')
+ winIds = [(w, t, ids) for w, t, ids in winIds if t not in unLand]
+
+ # Convert list of triples to mapping:
+ winIds = {(w, t): ids for w, t, ids in winIds}
+ return alias, defaults, winIds
+
def readLocales(self, calendars = ('gregorian',)):
- locales = tuple(self.__allLocales(calendars))
- return dict(((k.language_id, k.script_id, k.country_id, k.variant_code),
- k) for k in locales)
+ return {(k.language_id, k.script_id, k.territory_id, k.variant_id): k
+ for k in self.__allLocales(calendars)}
def __allLocales(self, calendars):
def skip(locale, reason):
- return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason)
+ return f'Skipping defaultContent locale "{locale}" ({reason})\n'
for locale in self.root.defaultContentLocales:
try:
- language, script, country, variant = self.__splitLocale(locale)
+ language, script, territory, variant = self.__splitLocale(locale)
except ValueError:
self.whitter(skip(locale, 'only language tag'))
continue
- if not (script or country):
+ if not (script or territory):
self.grumble(skip(locale, 'second tag is neither script nor territory'))
continue
- if not (language and country):
+ if not (language and territory):
continue
try:
yield self.__getLocaleData(self.root.locale(locale), calendars,
- language, script, country, variant)
+ language, script, territory, variant)
except Error as e:
self.grumble(skip(locale, e.message))
for locale in self.root.fileLocales:
try:
chain = self.root.locale(locale)
- language, script, country, variant = chain.tagCodes()
+ language, script, territory, variant = chain.tagCodes()
assert language
# TODO: this skip should probably be based on likely
- # sub-tags, instead of empty country: if locale has a
+ # sub-tags, instead of empty territory: if locale has a
# likely-subtag expansion, that's what QLocale uses,
# and we'll be saving its data for the expanded locale
# anyway, so don't need to record it for itself.
# See also QLocaleXmlReader.loadLocaleMap's grumble.
- if not country:
+ if not territory:
continue
- yield self.__getLocaleData(chain, calendars, language, script, country, variant)
+ yield self.__getLocaleData(chain, calendars, language, script, territory, variant)
except Error as e:
- self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message))
+ self.grumble(f'Skipping file locale "{locale}" ({e})\n')
import textwrap
@staticmethod
@@ -153,13 +199,13 @@ class CldrReader (object):
def __parseTags(self, locale):
tags = self.__splitLocale(locale)
- language = tags.next()
- script = country = variant = ''
+ language = next(tags)
+ script = territory = variant = ''
try:
- script, country, variant = tags
+ script, territory, variant = tags
except ValueError:
pass
- return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant))
+ return tuple(p[1] for p in self.root.codesToIdName(language, script, territory, variant))
def __splitLocale(self, name):
"""Generate (language, script, territory, variant) from a locale name
@@ -171,14 +217,18 @@ class CldrReader (object):
single tag (i.e. contains no underscores). Always yields 1 or
4 values, never 2 or 3."""
tags = iter(name.split('_'))
- yield tags.next() # Language
- tag = tags.next() # may raise StopIteration
+ yield next(tags) # Language
+
+ try:
+ tag = next(tags)
+ except StopIteration:
+ return
# Script is always four letters, always capitalised:
if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
yield tag
try:
- tag = tags.next()
+ tag = next(tags)
except StopIteration:
tag = ''
else:
@@ -188,7 +238,7 @@ class CldrReader (object):
if tag and tag.isupper() or tag.isdigit():
yield tag
try:
- tag = tags.next()
+ tag = next(tags)
except StopIteration:
tag = ''
else:
@@ -201,21 +251,22 @@ class CldrReader (object):
else:
yield ''
- # If nothing is left, StopIteration will avoid the warning:
- if not tag:
- tag = tags.next()
- self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join(tag + tuple(tags)), name))
+ rest = [tag] if tag else []
+ rest.extend(tags)
- def __getLocaleData(self, scan, calendars, language, script, country, variant):
- ids, names = zip(*self.root.codesToIdName(language, script, country, variant))
- assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant)
+ if rest:
+ self.grumble(f'Ignoring unparsed cruft {"_".join(rest)} in {name}\n')
+
+ def __getLocaleData(self, scan, calendars, language, script, territory, variant):
+ ids, names = zip(*self.root.codesToIdName(language, script, territory, variant))
+ assert ids[0] > 0 and ids[2] > 0, (language, script, territory, variant)
locale = Locale(
language = names[0], language_code = language, language_id = ids[0],
script = names[1], script_code = script, script_id = ids[1],
- country = names[2], country_code = country, country_id = ids[2],
- variant_code = variant)
+ territory = names[2], territory_code = territory, territory_id = ids[2],
+ variant_code = variant, variant_id = ids[3])
- firstDay, weStart, weEnd = self.root.weekData(country)
+ firstDay, weStart, weEnd = self.root.weekData(territory)
assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
for day in (firstDay, weStart, weEnd))
@@ -223,7 +274,7 @@ class CldrReader (object):
weekendStart = weStart,
weekendEnd = weEnd)
- iso, digits, rounding = self.root.currencyData(country)
+ iso, digits, rounding = self.root.currencyData(territory)
locale.update(currencyIsoCode = iso,
currencyDigits = int(digits),
currencyRounding = int(rounding))
@@ -231,7 +282,7 @@ class CldrReader (object):
locale.update(scan.currencyData(iso))
locale.update(scan.numericData(self.root.numberSystem, self.whitter))
locale.update(scan.textPatternData())
- locale.update(scan.endonyms(language, script, country, variant))
+ locale.update(scan.endonyms(language, script, territory, variant))
locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ...
locale.update(scan.calendarNames(calendars)) # Names of days and months
@@ -242,7 +293,7 @@ class CldrReader (object):
# the cache. If a process were to instantiate this class with distinct
# roots, each cache would be filled by the first to need it !
class CldrAccess (object):
- def __init__(self, root):
+ def __init__(self, root: Path):
"""Set up a master object for accessing CLDR data.
Single parameter, root, is the file-system path to the root of
@@ -250,18 +301,18 @@ class CldrAccess (object):
contain dtd/, main/ and supplemental/ sub-directories."""
self.root = root
- def xml(self, *path):
+ def xml(self, relative_path: str):
"""Load a single XML file and return its root element as an XmlScanner.
The path is interpreted relative to self.root"""
- return XmlScanner(Node(self.__xml(path)))
+ return XmlScanner(Node(self.__xml(relative_path)))
def supplement(self, name):
"""Loads supplemental data as a Supplement object.
The name should be that of a file in common/supplemental/, without path.
"""
- return Supplement(Node(self.__xml(('common', 'supplemental', name))))
+ return Supplement(Node(self.__xml(f'common/supplemental/{name}')))
def locale(self, name):
"""Loads all data for a locale as a LocaleScanner object.
@@ -273,17 +324,18 @@ class CldrAccess (object):
inheritance, where relevant."""
return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
+ def englishNaming(self, tag): # see QLocaleXmlWriter.enumData()
+ return self.__codeMap(tag).get
+
@property
- def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir,
- splitExtension = os.path.splitext):
+ def fileLocales(self) -> Iterable[str]:
"""Generator for locale IDs seen in file-names.
All *.xml other than root.xml in common/main/ are assumed to
identify locales."""
- for name in listDirectory(joinPath(self.root, 'common', 'main')):
- stem, ext = splitExtension(name)
- if ext == '.xml' and stem != 'root':
- yield stem
+ for path in self.root.joinpath('common/main').glob('*.xml'):
+ if path.stem != 'root':
+ yield path.stem
@property
def defaultContentLocales(self):
@@ -304,44 +356,44 @@ class CldrAccess (object):
def numberSystem(self, system):
"""Get a description of a numbering system.
- Returns a mapping, with keys u'digits', u'type' and u'id'; the
+ Returns a mapping, with keys 'digits', 'type' and 'id'; the
value for this last is system. Raises KeyError for unknown
number system, ldml.Error on failure to load data."""
try:
return self.__numberSystems[system]
except KeyError:
- raise Error('Unsupported number system: {}'.format(system))
+ raise Error(f'Unsupported number system: {system}')
- def weekData(self, country):
+ def weekData(self, territory):
"""Data on the weekly cycle.
Returns a triple (W, S, E) of en's short names for week-days;
W is the first day of the week, S the start of the week-end
- and E the end of the week-end. Where data for a country is
+ and E the end of the week-end. Where data for a territory is
unavailable, the data for CLDR's territory 001 (The World) is
used."""
try:
- return self.__weekData[country]
+ return self.__weekData[territory]
except KeyError:
return self.__weekData['001']
- def currencyData(self, country):
- """Returns currency data for the given country code.
+ def currencyData(self, territory):
+ """Returns currency data for the given territory code.
Return value is a tuple (ISO4217 code, digit count, rounding
- mode). If CLDR provides no data for this country, ('', 2, 1)
+ mode). If CLDR provides no data for this territory, ('', 2, 1)
is the default result.
"""
try:
- return self.__currencyData[country]
+ return self.__currencyData[territory]
except KeyError:
return '', 2, 1
- def codesToIdName(self, language, script, country, variant = ''):
+ def codesToIdName(self, language, script, territory, variant = ''):
"""Maps each code to the appropriate ID and name.
Returns a 4-tuple of (ID, name) pairs corresponding to the
- language, script, country and variant given. Raises a
+ language, script, territory and variant given. Raises a
suitable error if any of them is unknown, indicating all that
are unknown plus suitable names for any that could sensibly be
added to enumdata.py to make them known.
@@ -353,33 +405,33 @@ class CldrAccess (object):
try:
return (enum('language')[language],
enum('script')[script],
- enum('country')[country],
+ enum('territory')[territory],
enum('variant')[variant])
except KeyError:
pass
- parts, values = [], [language, script, country, variant]
- for index, key in enumerate(('language', 'script', 'country', 'variant')):
+ parts, values = [], [language, script, territory, variant]
+ for index, key in enumerate(('language', 'script', 'territory', 'variant')):
naming, enums = self.__codeMap(key), enum(key)
value = values[index]
if value not in enums:
- text = '{} code {}'.format(key, value)
+ text = f'{key} code {value}'
name = naming.get(value)
if name and value != 'POSIX':
- text += u' (could add {})'.format(name)
+ text += f' (could add {name})'
parts.append(text)
if len(parts) > 1:
parts[-1] = 'and ' + parts[-1]
- assert parts
+ else:
+ assert parts
+ if parts[0].startswith('variant'):
+ raise Error(f'No support for {parts[0]}',
+ language, script, territory, variant)
raise Error('Unknown ' + ', '.join(parts),
- language, script, country, variant)
+ language, script, territory, variant)
@staticmethod
- def __checkEnum(given, proper, scraps,
- remap = { u'å': 'a', u'ã': 'a', u'ç': 'c', u'é': 'e', u'í': 'i', u'ü': 'u'},
- prefix = { 'St.': 'Saint', 'U.S.': 'United States' },
- suffixes = ( 'Han', ),
- skip = u'\u02bc'):
+ def __checkEnum(given, proper, scraps):
# Each is a { code: full name } mapping
for code, name in given.items():
try: right = proper[code]
@@ -387,35 +439,23 @@ class CldrAccess (object):
# No en.xml name for this code, but supplementalData's
# parentLocale may still believe in it:
if code not in scraps:
- yield name, '[Found no CLDR name for code {}]'.format(code)
- continue
- if name == right: continue
- ok = right.replace('&', 'And')
- for k, v in prefix.items():
- if ok.startswith(k + ' '):
- ok = v + ok[len(k):]
- while '(' in ok:
- try: f, t = ok.index('('), ok.index(')')
- except ValueError: break
- ok = ok[:f].rstrip() + ' ' + ok[t:].lstrip()
- if any(name == ok + ' ' + s for s in suffixes):
- continue
- if ''.join(ch for ch in name.lower() if not ch.isspace()) in ''.join(
- remap.get(ch, ch) for ch in ok.lower() if ch.isalpha() and ch not in skip):
+ yield name, f'[Found no CLDR name for code {code}]'
continue
- yield name, ok
+ cleaned = names_clash(right, name)
+ if cleaned:
+ yield name, cleaned
def checkEnumData(self, grumble):
scraps = set()
for k in self.__parentLocale.keys():
for f in k.split('_'):
scraps.add(f)
- from enumdata import language_list, country_list, script_list
- language = dict((v, k) for k, v in language_list.values() if not v.isspace())
- country = dict((v, k) for k, v in country_list.values() if v != 'ZZ')
- script = dict((v, k) for k, v in script_list.values() if v != 'Zzzz')
+ from enumdata import language_map, territory_map, script_map
+ language = {v: k for k, v in language_map.values() if not v.isspace()}
+ territory = {v: k for k, v in territory_map.values() if v != 'ZZ'}
+ script = {v: k for k, v in script_map.values() if v != 'Zzzz'}
lang = dict(self.__checkEnum(language, self.__codeMap('language'), scraps))
- land = dict(self.__checkEnum(country, self.__codeMap('country'), scraps))
+ land = dict(self.__checkEnum(territory, self.__codeMap('territory'), scraps))
text = dict(self.__checkEnum(script, self.__codeMap('script'), scraps))
if lang or land or text:
grumble("""\
@@ -424,81 +464,127 @@ enumdata.py (keeping the old name as an alias):
""")
if lang:
grumble('Language:\n\t'
- + '\n\t'.join('{} -> {}'.format(k, v) for k, v in lang.items())
+ + '\n\t'.join(f'{k} -> {v}' for k, v in lang.items())
+ '\n')
if land:
- grumble('Country:\n\t'
- + '\n\t'.join('{} -> {}'.format(k, v) for k, v in land.items())
+ grumble('Territory:\n\t'
+ + '\n\t'.join(f'{k} -> {v}' for k, v in land.items())
+ '\n')
if text:
grumble('Script:\n\t'
- + '\n\t'.join('{} -> {}'.format(k, v) for k, v in text.items())
+ + '\n\t'.join(f'{k} -> {v}' for k, v in text.items())
+ '\n')
grumble('\n')
- def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
+ def bcp47Aliases(self):
+ """Reads the mapping from CLDR IDs to IANA IDs
+
+ CLDR identifies timezones in various ways but its standard
+ 'name' for them, here described as a CLDR ID, has the form of
+ an IANA ID. CLDR IDs are stable across time, where IANA IDs
+ may be revised over time, for example Asia/Calcutta became
+ Asia/Kolkata. When a new zone is added to CLDR, it gets the
+ then-current IANA ID as its CLDR ID; if it is later
+ superseded, CLDR continues using the old ID, so we need a
+ mapping from that to current IANA IDs. Helpfully, CLDR
+ provides information about aliasing among time-zone IDs.
+
+ The file common/bcp47/timezone.xml has keyword/key/type
+ elements with attributes:
+
+ name -- zone code (ignore)
+ description -- long name for exemplar location, including
+ territory
+
+ and some of:
+
+ deprecated -- ignore entry if present (has no alias)
+ preferred -- only present if deprecated
+ since -- version at which this entry was added (ignore)
+ alias -- space-joined sequence of IANA-form IDs; first is CLDR ID
+ iana -- if present, repeats the alias entry that's the modern IANA ID
+
+ This returns a pair (alias, naming) wherein: alias is a
+ mapping from IANA-format IDs to actual IANA IDs, that maps
+ each alias to the contemporary ID used by IANA; and naming is
+ a mapping from IANA ID to the description it and its aliases
+ shared in their keyword/key/type entry."""
+ # File has the same form as supplements:
+ root = Supplement(Node(self.__xml('common/bcp47/timezone.xml')))
+
+ # If we ever need a mapping back to CLDR ID, we can make
+ # (description, space-joined-list) the naming values.
+ alias, naming = {}, {} # { alias: iana }, { iana: description }
+ for item, attrs in root.find('keyword/key/type', exclude=('deprecated',)):
+ assert 'description' in attrs, item
+ assert 'alias' in attrs, item
+ names = attrs['alias'].split()
+ assert not any(name in alias for name in names), item
+ # CLDR ID is names[0]; if IANA now uses another name for
+ # it, this is given as the iana attribute.
+ ianaid, fullName = attrs.get('iana', names[0]), attrs['description']
+ alias.update({name: ianaid for name in names})
+ assert not ianaid in naming
+ naming[ianaid] = fullName
+
+ return alias, naming
+
+ def readWindowsTimeZones(self, alias):
"""Digest CLDR's MS-Win time-zone name mapping.
- MS-Win have their own eccentric names for time-zones. CLDR
- helpfully provides a translation to more orthodox names.
-
- Singe argument, lookup, is a mapping from known MS-Win names
- for locales to a unique integer index (starting at 1).
-
- The XML structure we read has the form:
-
- <supplementalData>
- <windowsZones>
- <mapTimezones otherVersion="..." typeVersion="...">
- <!-- (UTC-08:00) Pacific Time (US & Canada) -->
- <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
- <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
- <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
- <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
- </mapTimezones>
- </windowsZones>
- </supplementalData>
-"""
+ Single argument, alias, should be the first part of the pair
+ returned by a call to bcp47Aliases(); it shall be used to
+ transform CLDR IDs into IANA IDs.
+
+ MS-Win have their own eccentric names for time-zones. CLDR
+ helpfully provides a translation to more orthodox names,
+ albeit these are CLDR IDs - see bcp47Aliases() - rather than
+ (up to date) IANA IDs. The windowsZones.xml supplement has
+ supplementalData/windowsZones/mapTimezones/mapZone nodes with
+ attributes
+
+ territory -- ISO code
+ type -- space-joined sequence of CLDR IDs of zones
+ other -- Windows name of these zones in the given territory
+
+ When 'territory' is '001', type is always just a single CLDR
+ zone ID. This is the default zone for the given Windows name.
+
+ For each mapZone node, its type is split on spacing and
+ cleaned up as follows. Those entries that are keys of alias
+ are mapped thereby to their canonical IANA IDs; all others are
+ presumed to be canonical IANA IDs and left unchanged. Any
+ later duplicates of earlier entries are omitted. The result
+ list of IANA IDs is joined with single spaces between to give
+ a string s.
+
+ Returns a twople (defaults, windows) in which defaults is a
+ mapping, from Windows ID to IANA ID (derived from the mapZone
+ nodes with territory='001'), and windows is a list of triples
+ (Windows ID, territory code, IANA ID list) in which the first
+ two entries are the 'other' and 'territory' fields of a
+ mapZone element and the last is s, its cleaned-up list of IANA
+ IDs."""
+
+ defaults, windows = {}, []
zones = self.supplement('windowsZones.xml')
- enum = self.__enumMap('country')
- badZones, unLands, defaults, windows = set(), set(), {}, {}
-
for name, attrs in zones.find('windowsZones/mapTimezones'):
if name != 'mapZone':
continue
- wid, code = attrs['other'], attrs['territory']
- data = dict(windowsId = wid,
- countryCode = code,
- ianaList = attrs['type'])
-
- try:
- key = lookup[wid]
- except KeyError:
- badZones.add(wid)
- key = 0
- data['windowsKey'] = key
+ wid, code, ianas = attrs['other'], attrs['territory'], []
+ for cldr in attrs['type'].split():
+ iana = alias.get(cldr, cldr)
+ if iana not in ianas:
+ ianas.append(iana)
- if code == u'001':
- defaults[key] = data['ianaList']
+ if code == '001':
+ assert len(ianas) == 1, (wid, *ianas)
+ defaults[wid] = ianas[0]
else:
- try:
- cid, name = enum[code]
- except KeyError:
- unLands.append(code)
- continue
- data.update(countryId = cid, country = name)
- windows[key, cid] = data
-
- if unLands:
- raise Error('Unknown country codes, please add to enumdata.py: '
- + ', '.join(sorted(unLands)))
-
- if badZones:
- raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
- + ', '.join(sorted(badZones)))
+ windows.append((wid, code, ' '.join(ianas)))
- return self.cldrVersion, defaults, windows
+ return defaults, windows
@property
def cldrVersion(self):
@@ -507,20 +593,20 @@ enumdata.py (keeping the old name as an alias):
return self.__cldrVersion
# Implementation details
- def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
+ def __xml(self, relative_path: str, cache = CacheDict(), read = minidom.parse):
try:
- doc = cache[path]
+ doc = cache[relative_path]
except KeyError:
- cache[path] = doc = read(joinPath(self.root, *path)).documentElement
+ cache[relative_path] = doc = read(str(self.root.joinpath(relative_path))).documentElement
return doc
- def __open(self, path, joinPath=os.path.join):
- return open(joinPath(self.root, *path))
+ def __open(self, relative_path: str) -> TextIO:
+ return self.root.joinpath(relative_path).open()
@property
def __rootLocale(self, cache = []):
if not cache:
- cache.append(self.xml('common', 'main', 'root.xml'))
+ cache.append(self.xml('common/main/root.xml'))
return cache[0]
@property
@@ -530,7 +616,7 @@ enumdata.py (keeping the old name as an alias):
return cache[0]
@property
- def __numberSystems(self, cache = {}, joinPath=os.path.join):
+ def __numberSystems(self, cache = {}):
if not cache:
for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
cache[attrs['id']] = attrs
@@ -563,7 +649,7 @@ enumdata.py (keeping the old name as an alias):
source = self.__supplementalData
for key in ('firstDay', 'weekendStart', 'weekendEnd'):
result = {}
- for ignore, attrs in source.find('weekData/' + key):
+ for ignore, attrs in source.find(f'weekData/{key}'):
assert ignore == key
day = attrs['day']
assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
@@ -579,8 +665,10 @@ enumdata.py (keeping the old name as an alias):
source = self.__supplementalData
for elt in source.findNodes('currencyData/region'):
iso, digits, rounding = '', 2, 1
+ # TODO: fractions/info[iso4217=DEFAULT] has rounding=0 - why do we differ ?
+ # Also: some fractions/info have cashDigits and cashRounding - should we use them ?
try:
- country = elt.dom.attributes['iso3166'].nodeValue
+ territory = elt.dom.attributes['iso3166'].nodeValue
except KeyError:
continue
for child in elt.findAllChildren('currency'):
@@ -596,16 +684,16 @@ enumdata.py (keeping the old name as an alias):
break
if iso:
for tag, data in source.find(
- 'currencyData/fractions/info[iso4217={}]'.format(iso)):
+ f'currencyData/fractions/info[iso4217={iso}]'):
digits = data['digits']
rounding = data['rounding']
- cache[country] = iso, digits, rounding
+ cache[territory] = iso, digits, rounding
assert cache
return cache
@property
- def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join):
+ def __unDistinguishedAttributes(self, cache = {}):
"""Mapping from tag names to lists of attributes.
LDML defines some attributes as 'distinguishing': if a node
@@ -625,7 +713,7 @@ enumdata.py (keeping the old name as an alias):
return cache
- def __scanLdmlDtd(self, joinPath = os.path.join):
+ def __scanLdmlDtd(self):
"""Scan the LDML DTD, record CLDR version
Yields (tag, attrs) pairs: on elements with a given tag,
@@ -635,7 +723,7 @@ enumdata.py (keeping the old name as an alias):
Sets self.__cldrVersion as a side-effect, since this
information is found in the same file."""
- with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
+ with self.__open('common/dtd/ldml.dtd') as dtd:
tag, ignored, last = None, None, None
for line in dtd:
@@ -670,15 +758,15 @@ enumdata.py (keeping the old name as an alias):
def __enumMap(self, key, cache = {}):
if not cache:
cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
- # They're not actually lists: mappings from numeric value
- # to pairs of full name and short code. What we want, in
- # each case, is a mapping from code to the other two.
- from enumdata import language_list, script_list, country_list
- for form, book, empty in (('language', language_list, 'AnyLanguage'),
- ('script', script_list, 'AnyScript'),
- ('country', country_list, 'AnyCountry')):
- cache[form] = dict((pair[1], (num, pair[0]))
- for num, pair in book.items() if pair[0] != 'C')
+ # They're mappings from numeric value to pairs of full
+ # name and short code. What we want, in each case, is a
+ # mapping from code to the other two.
+ from enumdata import language_map, script_map, territory_map
+ for form, book, empty in (('language', language_map, 'AnyLanguage'),
+ ('script', script_map, 'AnyScript'),
+ ('territory', territory_map, 'AnyTerritory')):
+ cache[form] = {pair[1]: (num, pair[0])
+ for num, pair in book.items() if pair[0] != 'C'}
# (Have to filter out the C locale, as we give it the
# same (all space) code as AnyLanguage, whose code
# should probably be 'und' instead.)
@@ -693,9 +781,9 @@ enumdata.py (keeping the old name as an alias):
def __codeMap(self, key, cache = {},
# Maps our name for it to CLDR's name:
naming = {'language': 'languages', 'script': 'scripts',
- 'country': 'territories', 'variant': 'variants'}):
+ 'territory': 'territories', 'variant': 'variants'}):
if not cache:
- root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames')
+ root = self.xml('common/main/en.xml').root.findUniqueChild('localeDisplayNames')
for dst, src in naming.items():
cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
assert cache
@@ -721,7 +809,13 @@ enumdata.py (keeping the old name as an alias):
except (KeyError, ValueError, TypeError):
pass
else:
- if key not in seen or not elt.attributes.has_key('alt'):
+ # Prefer stand-alone forms of names when present, ignore other
+ # alt="..." entries. For example, Traditional and Simplified
+ # Han omit "Han" in the plain form, but include it for
+ # stand-alone. As the stand-alone version appears later, it
+ # over-writes the plain one.
+ if (key not in seen or 'alt' not in elt.attributes
+ or elt.attributes['alt'].nodeValue == 'stand-alone'):
yield key, value
seen.add(key)
@@ -730,7 +824,8 @@ enumdata.py (keeping the old name as an alias):
def __parentLocale(self, cache = {}):
# see http://www.unicode.org/reports/tr35/#Parent_Locales
if not cache:
- for tag, attrs in self.__supplementalData.find('parentLocales'):
+ for tag, attrs in self.__supplementalData.find('parentLocales',
+ ('component',)):
parent = attrs.get('parent', '')
for child in attrs['locales'].split():
cache[child] = parent
@@ -738,10 +833,9 @@ enumdata.py (keeping the old name as an alias):
return cache
- def __localeAsDoc(self, name, aliasFor = None,
- joinPath = os.path.join, exists = os.path.isfile):
- path = ('common', 'main', name + '.xml')
- if exists(joinPath(self.root, *path)):
+ def __localeAsDoc(self, name: str, aliasFor = None):
+ path = f'common/main/{name}.xml'
+ if self.root.joinpath(path).exists():
elt = self.__xml(path)
for child in Node(elt).findAllChildren('alias'):
try:
@@ -754,8 +848,8 @@ enumdata.py (keeping the old name as an alias):
return elt
if aliasFor:
- raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias'
- .format(aliasFor, name))
+ raise Error(f'Fatal error: found an alias "{aliasFor}" -> "{name}", '
+ 'but found no file for the alias')
def __scanLocaleRoots(self, name):
while name and name != 'root':
@@ -780,4 +874,4 @@ enumdata.py (keeping the old name as an alias):
return chain
# Unpolute the namespace: we don't need to export these.
-del minidom, CacheDict, os
+del minidom, CacheDict