summary | refs | log | tree | commit | diff | stats
path: root/util/locale_database/cldr.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- util/locale_database/cldr.py 153
1 files changed, 89 insertions, 64 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
index 3448b89582..b610a1cfec 100644
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@@ -1,31 +1,5 @@
-# -*- coding: utf-8; -*-
-#############################################################################
-##
-## Copyright (C) 2021 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
+# Copyright (C) 2021 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
"""Digesting the CLDR's data.
Provides two classes:
@@ -42,6 +16,7 @@ from weakref import WeakValueDictionary as CacheDict
from pathlib import Path
from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from localetools import names_clash
from qlocalexml import Locale
class CldrReader (object):
@@ -100,9 +75,8 @@ class CldrReader (object):
pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
def readLocales(self, calendars = ('gregorian',)):
- locales = tuple(self.__allLocales(calendars))
- return dict(((k.language_id, k.script_id, k.territory_id, k.variant_code),
- k) for k in locales)
+ return {(k.language_id, k.script_id, k.territory_id, k.variant_code): k
+ for k in self.__allLocales(calendars)}
def __allLocales(self, calendars):
def skip(locale, reason):
@@ -279,6 +253,9 @@ class CldrAccess (object):
inheritance, where relevant."""
return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
+ def englishNaming(self, tag): # see QLocaleXmlWriter.enumData()
+ return self.__codeMap(tag).get
+
@property
def fileLocales(self) -> Iterable[str]:
"""Generator for locale IDs seen in file-names.
@@ -374,16 +351,16 @@ class CldrAccess (object):
parts.append(text)
if len(parts) > 1:
parts[-1] = 'and ' + parts[-1]
- assert parts
+ else:
+ assert parts
+ if parts[0].startswith('variant'):
+ raise Error(f'No support for {parts[0]}',
+ language, script, territory, variant)
raise Error('Unknown ' + ', '.join(parts),
language, script, territory, variant)
@staticmethod
- def __checkEnum(given, proper, scraps,
- remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ü': 'u'},
- prefix = { 'St.': 'Saint', 'U.S.': 'United States' },
- suffixes = ( 'Han', ),
- skip = '\u02bc'):
+ def __checkEnum(given, proper, scraps):
# Each is a { code: full name } mapping
for code, name in given.items():
try: right = proper[code]
@@ -393,21 +370,9 @@ class CldrAccess (object):
if code not in scraps:
yield name, f'[Found no CLDR name for code {code}]'
continue
- if name == right: continue
- ok = right.replace('&', 'And')
- for k, v in prefix.items():
- if ok.startswith(k + ' '):
- ok = v + ok[len(k):]
- while '(' in ok:
- try: f, t = ok.index('('), ok.index(')')
- except ValueError: break
- ok = ok[:f].rstrip() + ' ' + ok[t:].lstrip()
- if any(name == ok + ' ' + s for s in suffixes):
- continue
- if ''.join(ch for ch in name.lower() if not ch.isspace()) in ''.join(
- remap.get(ch, ch) for ch in ok.lower() if ch.isalpha() and ch not in skip):
- continue
- yield name, ok
+ cleaned = names_clash(right, name)
+ if cleaned:
+ yield name, cleaned
def checkEnumData(self, grumble):
scraps = set()
@@ -415,9 +380,9 @@ class CldrAccess (object):
for f in k.split('_'):
scraps.add(f)
from enumdata import language_map, territory_map, script_map
- language = dict((v, k) for k, v in language_map.values() if not v.isspace())
- territory = dict((v, k) for k, v in territory_map.values() if v != 'ZZ')
- script = dict((v, k) for k, v in script_map.values() if v != 'Zzzz')
+ language = {v: k for k, v in language_map.values() if not v.isspace()}
+ territory = {v: k for k, v in territory_map.values() if v != 'ZZ'}
+ script = {v: k for k, v in script_map.values() if v != 'Zzzz'}
lang = dict(self.__checkEnum(language, self.__codeMap('language'), scraps))
land = dict(self.__checkEnum(territory, self.__codeMap('territory'), scraps))
text = dict(self.__checkEnum(script, self.__codeMap('script'), scraps))
@@ -440,13 +405,66 @@ enumdata.py (keeping the old name as an alias):
+ '\n')
grumble('\n')
+ def bcp47Aliases(self):
+ """Reads the mapping from CLDR IDs to IANA IDs
+
+ CLDR identifies timezones in various ways but its standard
+ 'name' for them, here described as a CLDR ID, has the form of
+ an IANA ID. CLDR IDs are stable across time, where IANA IDs
+ may be revised over time, for example Asia/Calcutta became
+ Asia/Kolkata. When a new zone is added to CLDR, it gets the
+ then-current IANA ID as its CLDR ID; if it is later
+ superseded, CLDR continues using the old ID, so we need a
+ mapping from that to current IANA IDs. Helpfully, CLDR
+ provides information about aliasing among time-zone IDs.
+
+ The file common/bcp47/timezone.xml has keyword/key/type
+ elements with attributes:
+
+ name -- zone code (ignore)
+ description -- long name for exemplar location, including
+ territory
+
+ and some of:
+
+ deprecated -- ignore entry if present (has no alias)
+ preferred -- only present if deprecated
+ since -- version at which this entry was added (ignore)
+ alias -- space-joined sequence of IANA-form IDs; first is CLDR ID
+ iana -- if present, repeats the alias entry that's the modern IANA ID
+
+ This returns a pair (alias, naming) wherein: alias is a
+ mapping from IANA-format IDs to actual IANA IDs, that maps
+ each alias to the contemporary ID used by IANA; and naming is
+ a mapping from IANA ID to the description it and its aliases
+ shared in their keyword/key/type entry."""
+ # File has the same form as supplements:
+ root = Supplement(Node(self.__xml('common/bcp47/timezone.xml')))
+
+ # If we ever need a mapping back to CLDR ID, we can make
+ # (description, space-joined-list) the naming values.
+ alias, naming = {}, {} # { alias: iana }, { iana: description }
+ for item, attrs in root.find('keyword/key/type', exclude=('deprecated',)):
+ assert 'description' in attrs, item
+ assert 'alias' in attrs, item
+ names = attrs['alias'].split()
+ assert not any(name in alias for name in names), item
+ # CLDR ID is names[0]; if IANA now uses another name for
+ # it, this is given as the iana attribute.
+ ianaid, fullName = attrs.get('iana', names[0]), attrs['description']
+ alias.update({name: ianaid for name in names})
+ assert not ianaid in naming
+ naming[ianaid] = fullName
+
+ return alias, naming
+
def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
"""Digest CLDR's MS-Win time-zone name mapping.
MS-Win have their own eccentric names for time-zones. CLDR
helpfully provides a translation to more orthodox names.
- Singe argument, lookup, is a mapping from known MS-Win names
+ Single argument, lookup, is a mapping from known MS-Win names
for locales to a unique integer index (starting at 1).
The XML structure we read has the form:
@@ -474,7 +492,7 @@ enumdata.py (keeping the old name as an alias):
wid, code = attrs['other'], attrs['territory']
data = dict(windowsId = wid,
territoryCode = code,
- ianaList = attrs['type'])
+ ianaList = ' '.join(attrs['type'].split()))
try:
key = lookup[wid]
@@ -674,15 +692,15 @@ enumdata.py (keeping the old name as an alias):
def __enumMap(self, key, cache = {}):
if not cache:
cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
- # They're not actually lists: mappings from numeric value
- # to pairs of full name and short code. What we want, in
- # each case, is a mapping from code to the other two.
+ # They're mappings from numeric value to pairs of full
+ # name and short code. What we want, in each case, is a
+ # mapping from code to the other two.
from enumdata import language_map, script_map, territory_map
for form, book, empty in (('language', language_map, 'AnyLanguage'),
('script', script_map, 'AnyScript'),
('territory', territory_map, 'AnyTerritory')):
- cache[form] = dict((pair[1], (num, pair[0]))
- for num, pair in book.items() if pair[0] != 'C')
+ cache[form] = {pair[1]: (num, pair[0])
+ for num, pair in book.items() if pair[0] != 'C'}
# (Have to filter out the C locale, as we give it the
# same (all space) code as AnyLanguage, whose code
# should probably be 'und' instead.)
@@ -725,7 +743,13 @@ enumdata.py (keeping the old name as an alias):
except (KeyError, ValueError, TypeError):
pass
else:
- if key not in seen or 'alt' not in elt.attributes:
+ # Prefer stand-alone forms of names when present, ignore other
+ # alt="..." entries. For example, Traditional and Simplified
+ # Han omit "Han" in the plain form, but include it for
+ # stand-alone. As the stand-alone version appears later, it
+ # over-writes the plain one.
+ if (key not in seen or 'alt' not in elt.attributes
+ or elt.attributes['alt'].nodeValue == 'stand-alone'):
yield key, value
seen.add(key)
@@ -734,7 +758,8 @@ enumdata.py (keeping the old name as an alias):
def __parentLocale(self, cache = {}):
# see http://www.unicode.org/reports/tr35/#Parent_Locales
if not cache:
- for tag, attrs in self.__supplementalData.find('parentLocales'):
+ for tag, attrs in self.__supplementalData.find('parentLocales',
+ ('component',)):
parent = attrs.get('parent', '')
for child in attrs['locales'].split():
cache[child] = parent