diff options
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- | util/locale_database/cldr.py | 68 |
1 files changed, 33 insertions, 35 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py index 91b46d6a01..9e0bae9667 100644 --- a/util/locale_database/cldr.py +++ b/util/locale_database/cldr.py @@ -16,6 +16,7 @@ from weakref import WeakValueDictionary as CacheDict from pathlib import Path from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner +from localetools import names_clash from qlocalexml import Locale class CldrReader (object): @@ -74,9 +75,8 @@ class CldrReader (object): pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips) def readLocales(self, calendars = ('gregorian',)): - locales = tuple(self.__allLocales(calendars)) - return dict(((k.language_id, k.script_id, k.territory_id, k.variant_code), - k) for k in locales) + return {(k.language_id, k.script_id, k.territory_id, k.variant_code): k + for k in self.__allLocales(calendars)} def __allLocales(self, calendars): def skip(locale, reason): @@ -253,6 +253,9 @@ class CldrAccess (object): inheritance, where relevant.""" return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) + def englishNaming(self, tag): # see QLocaleXmlWriter.enumData() + return self.__codeMap(tag).get + @property def fileLocales(self) -> Iterable[str]: """Generator for locale IDs seen in file-names. @@ -348,16 +351,16 @@ class CldrAccess (object): parts.append(text) if len(parts) > 1: parts[-1] = 'and ' + parts[-1] - assert parts + else: + assert parts + if parts[0].startswith('variant'): + raise Error(f'No support for {parts[0]}', + language, script, territory, variant) raise Error('Unknown ' + ', '.join(parts), language, script, territory, variant) @staticmethod - def __checkEnum(given, proper, scraps, - remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ü': 'u'}, - prefix = { 'St.': 'Saint', 'U.S.': 'United States' }, - suffixes = ( 'Han', ), - skip = '\u02bc'): + def __checkEnum(given, proper, scraps): # Each is a { code: full name } mapping for code, name in given.items(): try: right = proper[code] @@ -367,21 +370,9 @@ class CldrAccess (object): if code not in scraps: yield name, f'[Found no CLDR name for code {code}]' continue - if name == right: continue - ok = right.replace('&', 'And') - for k, v in prefix.items(): - if ok.startswith(k + ' '): - ok = v + ok[len(k):] - while '(' in ok: - try: f, t = ok.index('('), ok.index(')') - except ValueError: break - ok = ok[:f].rstrip() + ' ' + ok[t:].lstrip() - if any(name == ok + ' ' + s for s in suffixes): - continue - if ''.join(ch for ch in name.lower() if not ch.isspace()) in ''.join( - remap.get(ch, ch) for ch in ok.lower() if ch.isalpha() and ch not in skip): - continue - yield name, ok + cleaned = names_clash(right, name) + if cleaned: + yield name, cleaned def checkEnumData(self, grumble): scraps = set() @@ -389,9 +380,9 @@ class CldrAccess (object): for f in k.split('_'): scraps.add(f) from enumdata import language_map, territory_map, script_map - language = dict((v, k) for k, v in language_map.values() if not v.isspace()) - territory = dict((v, k) for k, v in territory_map.values() if v != 'ZZ') - script = dict((v, k) for k, v in script_map.values() if v != 'Zzzz') + language = {v: k for k, v in language_map.values() if not v.isspace()} + territory = {v: k for k, v in territory_map.values() if v != 'ZZ'} + script = {v: k for k, v in script_map.values() if v != 'Zzzz'} lang = dict(self.__checkEnum(language, self.__codeMap('language'), scraps)) land = dict(self.__checkEnum(territory, self.__codeMap('territory'), scraps)) text = dict(self.__checkEnum(script, self.__codeMap('script'), scraps)) @@ -448,7 +439,7 @@ enumdata.py (keeping the old name as an alias): wid, code = attrs['other'], attrs['territory'] data = dict(windowsId = wid, territoryCode = code, - ianaList = attrs['type']) + ianaList = ' '.join(attrs['type'].split())) try: key = lookup[wid] @@ -648,15 +639,15 @@ enumdata.py (keeping the old name as an alias): def __enumMap(self, key, cache = {}): if not cache: cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')} - # They're not actually lists: mappings from numeric value - # to pairs of full name and short code. What we want, in - # each case, is a mapping from code to the other two. + # They're mappings from numeric value to pairs of full + # name and short code. What we want, in each case, is a + # mapping from code to the other two. from enumdata import language_map, script_map, territory_map for form, book, empty in (('language', language_map, 'AnyLanguage'), ('script', script_map, 'AnyScript'), ('territory', territory_map, 'AnyTerritory')): - cache[form] = dict((pair[1], (num, pair[0])) - for num, pair in book.items() if pair[0] != 'C') + cache[form] = {pair[1]: (num, pair[0]) + for num, pair in book.items() if pair[0] != 'C'} # (Have to filter out the C locale, as we give it the # same (all space) code as AnyLanguage, whose code # should probably be 'und' instead.) @@ -699,7 +690,13 @@ enumdata.py (keeping the old name as an alias): except (KeyError, ValueError, TypeError): pass else: - if key not in seen or 'alt' not in elt.attributes: + # Prefer stand-alone forms of names when present, ignore other + # alt="..." entries. For example, Traditional and Simplified + # Han omit "Han" in the plain form, but include it for + # stand-alone. As the stand-alone version appears later, it + # over-writes the plain one. + if (key not in seen or 'alt' not in elt.attributes + or elt.attributes['alt'].nodeValue == 'stand-alone'): yield key, value seen.add(key) @@ -708,7 +705,8 @@ enumdata.py (keeping the old name as an alias): def __parentLocale(self, cache = {}): # see http://www.unicode.org/reports/tr35/#Parent_Locales if not cache: - for tag, attrs in self.__supplementalData.find('parentLocales'): + for tag, attrs in self.__supplementalData.find('parentLocales', + ('component',)): parent = attrs.get('parent', '') for child in attrs['locales'].split(): cache[child] = parent |