diff options
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- | util/locale_database/cldr.py | 291 |
1 files changed, 204 insertions, 87 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py index 91b46d6a01..75d687dd11 100644 --- a/util/locale_database/cldr.py +++ b/util/locale_database/cldr.py @@ -16,6 +16,7 @@ from weakref import WeakValueDictionary as CacheDict from pathlib import Path from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner +from localetools import names_clash from qlocalexml import Locale class CldrReader (object): @@ -73,10 +74,80 @@ class CldrReader (object): # more out. pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips) + def zoneData(self): + """Locale-independent timezone data. + + Returns a triple (alias, defaults, winIds) in which: + * alias is a mapping from aliases for IANA zone IDs, that + have the form of IANA IDs, to actual current IANA IDs; in + particular, this maps each CLDR zone ID to its + corresponding IANA ID. + * defaults maps each Windows name for a zone to the IANA ID + to use for it by default (when no territory is specified, + or when no entry in winIds matches the given Windows name + and territory). + * winIds is a mapping {(winId, land): ianaList} from Windows + name and territory code to the space-joined list of IANA + IDs associated with the Windows name in the given + territory. + + and reports on any territories found in CLDR timezone data + that are not mentioned in enumdata.territory_map, on any + Windows IDs given in zonedata.windowsIdList that are no longer + covered by the CLDR data.""" + alias, ignored = self.root.bcp47Aliases() + defaults, winIds = self.root.readWindowsTimeZones(alias) + + from zonedata import windowsIdList + winUnused = set(n for n, o in windowsIdList).difference( + set(defaults).union(w for w, t, ids in winIds)) + if winUnused: + joined = "\n\t".join(winUnused) + self.whitter.write( + f'No Windows ID in\n\t{joined}\nis still in use.\n' + 'They could be removed at the next major version.\n') + + # Check for duplicate entries in winIds: + last = ('', '', '') + winDup = {} + for triple in sorted(winIds): + if triple[:2] == last[:2]: + try: + seq = winDup[triple[:2]] + except KeyError: + seq = winDup[triple[:2]] = [] + seq.append(triple[-1]) + if winDup: + joined = '\n\t'.join(f'{t}, {w}: ", ".join(ids)' + for (w, t), ids in winDup.items()) + self.whitter.write( + f'Duplicated (territory, Windows ID) entries:\n\t{joined}\n') + winIds = [trip for trip in winIds if trip[:2] not in winDup] + for (w, t), seq in winDup.items(): + ianalist = [] + for ids in seq: + for iana in ids.split(): + if iana not in ianaList: + ianaList.append(iana) + winIds.append((w, t, ' '.join(ianaList))) + + from enumdata import territory_map + unLand = set(t for w, t, ids in winIds).difference( + v[1] for k, v in territory_map.items()) + if unLand: + self.grumble.write( + 'Unknown territory codes in timezone data: ' + f'{", ".join(unLand)}\n' + 'Skipping Windows zone mappings for these territories\n') + winIds = [(w, t, ids) for w, t, ids in winIds if t not in unLand] + + # Convert list of triples to mapping: + winIds = {(w, t): ids for w, t, ids in winIds} + return alias, defaults, winIds + def readLocales(self, calendars = ('gregorian',)): - locales = tuple(self.__allLocales(calendars)) - return dict(((k.language_id, k.script_id, k.territory_id, k.variant_code), - k) for k in locales) + return {(k.language_id, k.script_id, k.territory_id, k.variant_id): k + for k in self.__allLocales(calendars)} def __allLocales(self, calendars): def skip(locale, reason): @@ -193,7 +264,7 @@ class CldrReader (object): language = names[0], language_code = language, language_id = ids[0], script = names[1], script_code = script, script_id = ids[1], territory = names[2], territory_code = territory, territory_id = ids[2], - variant_code = variant) + variant_code = variant, variant_id = ids[3]) firstDay, weStart, weEnd = self.root.weekData(territory) assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') @@ -253,6 +324,9 @@ class CldrAccess (object): inheritance, where relevant.""" return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) + def englishNaming(self, tag): # see QLocaleXmlWriter.enumData() + return self.__codeMap(tag).get + @property def fileLocales(self) -> Iterable[str]: """Generator for locale IDs seen in file-names. @@ -348,16 +422,16 @@ class CldrAccess (object): parts.append(text) if len(parts) > 1: parts[-1] = 'and ' + parts[-1] - assert parts + else: + assert parts + if parts[0].startswith('variant'): + raise Error(f'No support for {parts[0]}', + language, script, territory, variant) raise Error('Unknown ' + ', '.join(parts), language, script, territory, variant) @staticmethod - def __checkEnum(given, proper, scraps, - remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ü': 'u'}, - prefix = { 'St.': 'Saint', 'U.S.': 'United States' }, - suffixes = ( 'Han', ), - skip = '\u02bc'): + def __checkEnum(given, proper, scraps): # Each is a { code: full name } mapping for code, name in given.items(): try: right = proper[code] @@ -367,21 +441,9 @@ class CldrAccess (object): if code not in scraps: yield name, f'[Found no CLDR name for code {code}]' continue - if name == right: continue - ok = right.replace('&', 'And') - for k, v in prefix.items(): - if ok.startswith(k + ' '): - ok = v + ok[len(k):] - while '(' in ok: - try: f, t = ok.index('('), ok.index(')') - except ValueError: break - ok = ok[:f].rstrip() + ' ' + ok[t:].lstrip() - if any(name == ok + ' ' + s for s in suffixes): - continue - if ''.join(ch for ch in name.lower() if not ch.isspace()) in ''.join( - remap.get(ch, ch) for ch in ok.lower() if ch.isalpha() and ch not in skip): - continue - yield name, ok + cleaned = names_clash(right, name) + if cleaned: + yield name, cleaned def checkEnumData(self, grumble): scraps = set() @@ -389,9 +451,9 @@ class CldrAccess (object): for f in k.split('_'): scraps.add(f) from enumdata import language_map, territory_map, script_map - language = dict((v, k) for k, v in language_map.values() if not v.isspace()) - territory = dict((v, k) for k, v in territory_map.values() if v != 'ZZ') - script = dict((v, k) for k, v in script_map.values() if v != 'Zzzz') + language = {v: k for k, v in language_map.values() if not v.isspace()} + territory = {v: k for k, v in territory_map.values() if v != 'ZZ'} + script = {v: k for k, v in script_map.values() if v != 'Zzzz'} lang = dict(self.__checkEnum(language, self.__codeMap('language'), scraps)) land = dict(self.__checkEnum(territory, self.__codeMap('territory'), scraps)) text = dict(self.__checkEnum(script, self.__codeMap('script'), scraps)) @@ -414,69 +476,115 @@ enumdata.py (keeping the old name as an alias): + '\n') grumble('\n') - def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py + def bcp47Aliases(self): + """Reads the mapping from CLDR IDs to IANA IDs + + CLDR identifies timezones in various ways but its standard + 'name' for them, here described as a CLDR ID, has the form of + an IANA ID. CLDR IDs are stable across time, where IANA IDs + may be revised over time, for example Asia/Calcutta became + Asia/Kolkata. When a new zone is added to CLDR, it gets the + then-current IANA ID as its CLDR ID; if it is later + superseded, CLDR continues using the old ID, so we need a + mapping from that to current IANA IDs. Helpfully, CLDR + provides information about aliasing among time-zone IDs. + + The file common/bcp47/timezone.xml has keyword/key/type + elements with attributes: + + name -- zone code (ignore) + description -- long name for exemplar location, including + territory + + and some of: + + deprecated -- ignore entry if present (has no alias) + preferred -- only present if deprecated + since -- version at which this entry was added (ignore) + alias -- space-joined sequence of IANA-form IDs; first is CLDR ID + iana -- if present, repeats the alias entry that's the modern IANA ID + + This returns a pair (alias, naming) wherein: alias is a + mapping from IANA-format IDs to actual IANA IDs, that maps + each alias to the contemporary ID used by IANA; and naming is + a mapping from IANA ID to the description it and its aliases + shared in their keyword/key/type entry.""" + # File has the same form as supplements: + root = Supplement(Node(self.__xml('common/bcp47/timezone.xml'))) + + # If we ever need a mapping back to CLDR ID, we can make + # (description, space-joined-list) the naming values. + alias, naming = {}, {} # { alias: iana }, { iana: description } + for item, attrs in root.find('keyword/key/type', exclude=('deprecated',)): + assert 'description' in attrs, item + assert 'alias' in attrs, item + names = attrs['alias'].split() + assert not any(name in alias for name in names), item + # CLDR ID is names[0]; if IANA now uses another name for + # it, this is given as the iana attribute. + ianaid, fullName = attrs.get('iana', names[0]), attrs['description'] + alias.update({name: ianaid for name in names}) + assert not ianaid in naming + naming[ianaid] = fullName + + return alias, naming + + def readWindowsTimeZones(self, alias): """Digest CLDR's MS-Win time-zone name mapping. - MS-Win have their own eccentric names for time-zones. CLDR - helpfully provides a translation to more orthodox names. - - Single argument, lookup, is a mapping from known MS-Win names - for locales to a unique integer index (starting at 1). - - The XML structure we read has the form: - - <supplementalData> - <windowsZones> - <mapTimezones otherVersion="..." typeVersion="..."> - <!-- (UTC-08:00) Pacific Time (US & Canada) --> - <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/> - <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/> - <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/> - <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/> - </mapTimezones> - </windowsZones> - </supplementalData> -""" + Single argument, alias, should be the first part of the pair + returned by a call to bcp47Aliases(); it shall be used to + transform CLDR IDs into IANA IDs. + + MS-Win have their own eccentric names for time-zones. CLDR + helpfully provides a translation to more orthodox names, + albeit these are CLDR IDs - see bcp47Aliases() - rather than + (up to date) IANA IDs. The windowsZones.xml supplement has + supplementalData/windowsZones/mapTimezones/mapZone nodes with + attributes + + territory -- ISO code + type -- space-joined sequence of CLDR IDs of zones + other -- Windows name of these zones in the given territory + + When 'territory' is '001', type is always just a single CLDR + zone ID. This is the default zone for the given Windows name. + + For each mapZone node, its type is split on spacing and + cleaned up as follows. Those entries that are keys of alias + are mapped thereby to their canonical IANA IDs; all others are + presumed to be canonical IANA IDs and left unchanged. Any + later duplicates of earlier entries are omitted. The result + list of IANA IDs is joined with single spaces between to give + a string s. + + Returns a twople (defaults, windows) in which defaults is a + mapping, from Windows ID to IANA ID (derived from the mapZone + nodes with territory='001'), and windows is a list of triples + (Windows ID, territory code, IANA ID list) in which the first + two entries are the 'other' and 'territory' fields of a + mapZone element and the last is s, its cleaned-up list of IANA + IDs.""" + + defaults, windows = {}, [] zones = self.supplement('windowsZones.xml') - enum = self.__enumMap('territory') - badZones, unLands, defaults, windows = set(), set(), {}, {} - for name, attrs in zones.find('windowsZones/mapTimezones'): if name != 'mapZone': continue - wid, code = attrs['other'], attrs['territory'] - data = dict(windowsId = wid, - territoryCode = code, - ianaList = attrs['type']) - - try: - key = lookup[wid] - except KeyError: - badZones.add(wid) - key = 0 - data['windowsKey'] = key + wid, code, ianas = attrs['other'], attrs['territory'], [] + for cldr in attrs['type'].split(): + iana = alias.get(cldr, cldr) + if iana not in ianas: + ianas.append(iana) if code == '001': - defaults[key] = data['ianaList'] + assert len(ianas) == 1, (wid, *ianas) + defaults[wid] = ianas[0] else: - try: - cid, name = enum[code] - except KeyError: - unLands.append(code) - continue - data.update(territoryId = cid, territory = name) - windows[key, cid] = data - - if unLands: - raise Error('Unknown territory codes, please add to enumdata.py: ' - + ', '.join(sorted(unLands))) - - if badZones: - raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: ' - + ', '.join(sorted(badZones))) + windows.append((wid, code, ' '.join(ianas))) - return self.cldrVersion, defaults, windows + return defaults, windows @property def cldrVersion(self): @@ -557,6 +665,8 @@ enumdata.py (keeping the old name as an alias): source = self.__supplementalData for elt in source.findNodes('currencyData/region'): iso, digits, rounding = '', 2, 1 + # TODO: fractions/info[iso4217=DEFAULT] has rounding=0 - why do we differ ? + # Also: some fractions/info have cashDigits and cashRounding - should we use them ? try: territory = elt.dom.attributes['iso3166'].nodeValue except KeyError: @@ -648,15 +758,15 @@ enumdata.py (keeping the old name as an alias): def __enumMap(self, key, cache = {}): if not cache: cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')} - # They're not actually lists: mappings from numeric value - # to pairs of full name and short code. What we want, in - # each case, is a mapping from code to the other two. + # They're mappings from numeric value to pairs of full + # name and short code. What we want, in each case, is a + # mapping from code to the other two. from enumdata import language_map, script_map, territory_map for form, book, empty in (('language', language_map, 'AnyLanguage'), ('script', script_map, 'AnyScript'), ('territory', territory_map, 'AnyTerritory')): - cache[form] = dict((pair[1], (num, pair[0])) - for num, pair in book.items() if pair[0] != 'C') + cache[form] = {pair[1]: (num, pair[0]) + for num, pair in book.items() if pair[0] != 'C'} # (Have to filter out the C locale, as we give it the # same (all space) code as AnyLanguage, whose code # should probably be 'und' instead.) @@ -699,7 +809,13 @@ enumdata.py (keeping the old name as an alias): except (KeyError, ValueError, TypeError): pass else: - if key not in seen or 'alt' not in elt.attributes: + # Prefer stand-alone forms of names when present, ignore other + # alt="..." entries. For example, Traditional and Simplified + # Han omit "Han" in the plain form, but include it for + # stand-alone. As the stand-alone version appears later, it + # over-writes the plain one. + if (key not in seen or 'alt' not in elt.attributes + or elt.attributes['alt'].nodeValue == 'stand-alone'): yield key, value seen.add(key) @@ -708,7 +824,8 @@ enumdata.py (keeping the old name as an alias): def __parentLocale(self, cache = {}): # see http://www.unicode.org/reports/tr35/#Parent_Locales if not cache: - for tag, attrs in self.__supplementalData.find('parentLocales'): + for tag, attrs in self.__supplementalData.find('parentLocales', + ('component',)): parent = attrs.get('parent', '') for child in attrs['locales'].split(): cache[child] = parent |