diff options
Diffstat (limited to 'util/locale_database/qlocalexml.py')
-rw-r--r-- | util/locale_database/qlocalexml.py | 238 |
1 files changed, 146 insertions, 92 deletions
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py index 4fcfe32a43..a104402e23 100644 --- a/util/locale_database/qlocalexml.py +++ b/util/locale_database/qlocalexml.py @@ -1,31 +1,5 @@ -# coding=utf8 -############################################################################# -## -## Copyright (C) 2020 The Qt Company Ltd. -## Contact: https://www.qt.io/licensing/ -## -## This file is part of the test suite of the Qt Toolkit. -## -## $QT_BEGIN_LICENSE:GPL-EXCEPT$ -## Commercial License Usage -## Licensees holding valid commercial Qt licenses may use this file in -## accordance with the commercial license agreement provided with the -## Software or, alternatively, in accordance with the terms contained in -## a written agreement between you and The Qt Company. For licensing terms -## and conditions see https://www.qt.io/terms-conditions. For further -## information use the contact form at https://www.qt.io/contact-us. -## -## GNU General Public License Usage -## Alternatively, this file may be used under the terms of the GNU -## General Public License version 3 as published by the Free Software -## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -## included in the packaging of this file. Please review the following -## information to ensure the GNU General Public License requirements will -## be met: https://www.gnu.org/licenses/gpl-3.0.html. -## -## $QT_END_LICENSE$ -## -############################################################################# +# Copyright (C) 2021 The Qt Company Ltd. +# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 """Shared serialization-scanning code for QLocaleXML format. Provides classes: @@ -35,15 +9,23 @@ Provides classes: Support: Spacer -- provides control over indentation of the output. + +RelaxNG schema for the used file format can be found in qlocalexml.rnc. 
+QLocaleXML files can be validated using: + + jing -c qlocalexml.rnc <file.xml> + +You can download jing from https://relaxng.org/jclark/jing.html if your +package manager lacks the jing package. """ -from __future__ import print_function + from xml.sax.saxutils import escape from localetools import Error # Tools used by Locale: def camel(seq): - yield seq.next() + yield next(seq) for word in seq: yield word.capitalize() @@ -51,14 +33,14 @@ def camelCase(words): return ''.join(camel(iter(words))) def addEscapes(s): - return ''.join(c if n < 128 else '\\x{:02x}'.format(n) + return ''.join(c if n < 128 else f'\\x{n:02x}' for n, c in ((ord(c), c) for c in s)) def startCount(c, text): # strspn """First index in text where it doesn't have a character in c""" assert text and text[0] in c try: - return (j for j, d in enumerate(text) if d not in c).next() + return next((j for j, d in enumerate(text) if d not in c)) except StopIteration: return len(text) @@ -118,21 +100,25 @@ def convertFormat(format): class QLocaleXmlReader (object): def __init__(self, filename): self.root = self.__parse(filename) - # Lists of (id, name, code) triples: - languages = tuple(self.__loadMap('language')) - scripts = tuple(self.__loadMap('script')) - countries = tuple(self.__loadMap('country')) + + from enumdata import language_map, script_map, territory_map + # Lists of (id, enum name, code, en.xml name) tuples: + languages = tuple(self.__loadMap('language', language_map)) + scripts = tuple(self.__loadMap('script', script_map)) + territories = tuple(self.__loadMap('territory', territory_map)) self.__likely = tuple(self.__likelySubtagsMap()) - # Mappings {ID: (name, code)} - self.languages = dict((v[0], v[1:]) for v in languages) - self.scripts = dict((v[0], v[1:]) for v in scripts) - self.countries = dict((v[0], v[1:]) for v in countries) - # Private mappings {name: (ID, code)} - self.__langByName = dict((v[1], (v[0], v[2])) for v in languages) - self.__textByName = dict((v[1], (v[0], v[2])) 
for v in scripts) - self.__landByName = dict((v[1], (v[0], v[2])) for v in countries) + + # Mappings {ID: (enum name, code, en.xml name)} + self.languages = {v[0]: v[1:] for v in languages} + self.scripts = {v[0]: v[1:] for v in scripts} + self.territories = {v[0]: v[1:] for v in territories} + + # Private mappings {enum name: (ID, code)} + self.__langByName = {v[1]: (v[0], v[2]) for v in languages} + self.__textByName = {v[1]: (v[0], v[2]) for v in scripts} + self.__landByName = {v[1]: (v[0], v[2]) for v in territories} # Other properties: - self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries) + self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories) self.cldrVersion = self.__firstChildText(self.root, "version") def loadLocaleMap(self, calendars, grumble = lambda text: None): @@ -142,18 +128,18 @@ class QLocaleXmlReader (object): locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars) language = self.__langByName[locale.language][0] script = self.__textByName[locale.script][0] - country = self.__landByName[locale.country][0] + territory = self.__landByName[locale.territory][0] if language != 1: # C - if country == 0: - grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language)) + if territory == 0: + grumble(f'loadLocaleMap: No territory id for "{locale.language}"\n') if script == 0: - # Find default script for the given language and country - see: + # Find default script for the given language and territory - see: # http://www.unicode.org/reports/tr35/#Likely_Subtags try: try: - to = likely[(locale.language, 'AnyScript', locale.country)] + to = likely[(locale.language, 'AnyScript', locale.territory)] except KeyError: to = likely[(locale.language, 'AnyScript', 'AnyTerritory')] except KeyError: @@ -162,11 +148,11 @@ class QLocaleXmlReader (object): locale.script = to[1] script = self.__textByName[locale.script][0] - yield (language, script, country), locale + yield (language, script, territory), locale 
def languageIndices(self, locales): index = 0 - for key, value in self.languages.iteritems(): + for key, value in self.languages.items(): i, count = 0, locales.count(key) if count > 0: i = index @@ -190,11 +176,11 @@ class QLocaleXmlReader (object): '_'.join(tag(give)), ids(give)) def defaultMap(self): - """Map language and script to their default country by ID. + """Map language and script to their default territory by ID. - Yields ((language, script), country) wherever the likely + Yields ((language, script), territory) wherever the likely sub-tags mapping says language's default locale uses the given - script and country.""" + script and territory.""" for have, give in self.__likely: if have[1:] == ('AnyScript', 'AnyTerritory') and give[2] != 'AnyTerritory': assert have[0] == give[0], (have, give) @@ -202,14 +188,41 @@ class QLocaleXmlReader (object): self.__textByName[give[1]][0]), self.__landByName[give[2]][0]) + def enumify(self, name, suffix): + """Stick together the parts of an enumdata.py name. + + Names given in enumdata.py include spaces and hyphens that we + can't include in an identifier, such as the name of a member + of an enum type. Removing those would lose the word + boundaries, so make sure each word starts with a capital (but + don't simply capitalize() as some names contain words, + e.g. McDonald, that have later capitals in them). 
+ + We also need to resolve duplication between languages and + territories (by adding a suffix to each) and add Script to the + ends of script-names that don't already end in it.""" + name = name.replace('-', ' ') + # Don't .capitalize() as McDonald is already camel-case (see enumdata.py): + name = ''.join(word[0].upper() + word[1:] for word in name.split()) + if suffix != 'Script': + assert not(name in self.__dupes and name.endswith(suffix)) + return name + suffix if name in self.__dupes else name + + if not name.endswith(suffix): + name += suffix + if name in self.__dupes: + raise Error(f'The script name "{name}" is messy') + return name + # Implementation details: - def __loadMap(self, category): + def __loadMap(self, category, enum): kid = self.__firstChildText - for element in self.__eachEltInGroup(self.root, category + 'List', category): - yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code') + for element in self.__eachEltInGroup(self.root, f'{category}List', category): + key = int(kid(element, 'id')) + yield key, enum[key][0], kid(element, 'code'), kid(element, 'name') def __likelySubtagsMap(self): - def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText): + def triplet(element, keys=('language', 'script', 'territory'), kid = self.__firstChildText): return tuple(kid(element, key) for key in keys) kid = self.__firstChildElt @@ -246,7 +259,7 @@ class QLocaleXmlReader (object): return child child = child.nextSibling - raise Error('No {} child found'.format(name)) + raise Error(f'No {name} child found') @classmethod def __firstChildText(cls, elt, key): @@ -302,7 +315,7 @@ class Spacer (object): elif line.startswith('<') and not line.startswith('<!'): cut = line.find('>') tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0] - if '</{}>'.format(tag) not in line: + if f'</{tag}>' not in line: self.current += self.__each return indent + line + '\n' @@ -333,10 +346,28 @@ class QLocaleXmlWriter 
(object): self.__write('<localeDatabase>') # Output of various sections, in their usual order: - def enumData(self, languages, scripts, countries): - self.__enumTable('language', languages) - self.__enumTable('script', scripts) - self.__enumTable('country', countries) + def enumData(self, code2name): + """Output name/id/code tables for language, script and territory. + + Parameter, code2name, is a function taking 'language', + 'script' or 'territory' and returning a lookup function that + maps codes, of the relevant type, to their English names. This + lookup function is passed a code and the name, both taken from + enumdata.py, that QLocale uses, so the .get() of a dict will + work. The English name from this lookup will be used by + QLocale::*ToString() for the enum member whose name is based + on the enumdata.py name passed as fallback to the lookup.""" + from enumdata import language_map, script_map, territory_map + self.__enumTable('language', language_map, code2name) + self.__enumTable('script', script_map, code2name) + self.__enumTable('territory', territory_map, code2name) + # Prepare to detect any unused codes (see __writeLocale(), close()): + self.__languages = set(p[1] for p in language_map.values() + if not p[1].isspace()) + self.__scripts = set(p[1] for p in script_map.values() + if p[1] != 'Zzzz') + self.__territories = set(p[1] for p in territory_map.values() + if p[1] != 'ZZ') def likelySubTags(self, entries): self.__openTag('likelySubtags') @@ -350,13 +381,11 @@ class QLocaleXmlWriter (object): def locales(self, locales, calendars): self.__openTag('localeList') self.__openTag('locale') - Locale.C(calendars).toXml(self.inTag, calendars) + self.__writeLocale(Locale.C(calendars), calendars) self.__closeTag('locale') - keys = locales.keys() - keys.sort() - for key in keys: + for key in sorted(locales.keys()): self.__openTag('locale') - locales[key].toXml(self.inTag, calendars) + self.__writeLocale(locales[key], calendars) self.__closeTag('locale') 
self.__closeTag('localeList') @@ -364,13 +393,27 @@ class QLocaleXmlWriter (object): self.inTag('version', cldrVersion) def inTag(self, tag, text): - self.__write('<{0}>{1}</{0}>'.format(tag, text)) + self.__write(f'<{tag}>{text}</{tag}>') - def close(self): + def close(self, grumble): + """Finish writing and grumble about any issues discovered.""" if self.__rawOutput != self.__complain: self.__write('</localeDatabase>') self.__rawOutput = self.__complain + if self.__languages or self.__scripts or self.__territories: + grumble('Some enum members are unused, corresponding to these tags:\n') + import textwrap + def kvetch(kind, seq, g = grumble, w = textwrap.wrap): + g('\n\t'.join(w(f' {kind}: {", ".join(sorted(seq))}', width=80)) + '\n') + if self.__languages: + kvetch('Languages', self.__languages) + if self.__scripts: + kvetch('Scripts', self.__scripts) + if self.__territories: + kvetch('Territories', self.__territories) + grumble('It may make sense to deprecate them.\n') + # Implementation details @staticmethod def __printit(text): @@ -379,28 +422,39 @@ class QLocaleXmlWriter (object): def __complain(text): raise Error('Attempted to write data after closing :-(') - def __enumTable(self, tag, table): - self.__openTag(tag + 'List') - for key, value in table.iteritems(): + @staticmethod + def __xmlSafe(text): + return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') + + def __enumTable(self, tag, table, code2name): + self.__openTag(f'{tag}List') + enname, safe = code2name(tag), self.__xmlSafe + for key, (name, code) in table.items(): self.__openTag(tag) - self.inTag('name', value[0]) + self.inTag('name', safe(enname(code, name))) self.inTag('id', key) - self.inTag('code', value[1]) + self.inTag('code', code) self.__closeTag(tag) - self.__closeTag(tag + 'List') + self.__closeTag(f'{tag}List') def __likelySubTag(self, tag, likely): self.__openTag(tag) self.inTag('language', likely[0]) self.inTag('script', likely[1]) - self.inTag('country', likely[2]) +
self.inTag('territory', likely[2]) # self.inTag('variant', likely[3]) self.__closeTag(tag) + def __writeLocale(self, locale, calendars): + locale.toXml(self.inTag, calendars) + self.__languages.discard(locale.language_code) + self.__scripts.discard(locale.script_code) + self.__territories.discard(locale.territory_code) + def __openTag(self, tag): - self.__write('<{}>'.format(tag)) + self.__write(f'<{tag}>') def __closeTag(self, tag): - self.__write('</{}>'.format(tag)) + self.__write(f'</{tag}>') def __write(self, line): self.__rawOutput(self.__wrap(line)) @@ -435,7 +489,7 @@ class Locale (object): # Convert from CLDR format-strings to QDateTimeParser ones: __asfmt = ("longDateFormat", "shortDateFormat", "longTimeFormat", "shortTimeFormat") # Just use the raw text: - __astxt = ("language", "languageEndonym", "script", "country", "countryEndonym", + __astxt = ("language", "languageEndonym", "script", "territory", "territoryEndonym", "decimal", "group", "zero", "list", "percent", "minus", "plus", "exp", "quotationStart", "quotationEnd", @@ -473,7 +527,7 @@ class Locale (object): data['listDelim' if k == 'list' else k] = lookup(k) for k in cls.propsMonthDay('months'): - data[k] = dict((cal, lookup('_'.join((k, cal)))) for cal in calendars) + data[k] = {cal: lookup('_'.join((k, cal))) for cal in calendars} grouping = lookup('groupSizes').split(';') data.update(groupLeast = int(grouping[0]), @@ -493,15 +547,15 @@ class Locale (object): form used by CLDR; its default is ('gregorian',). 
""" get = lambda k: getattr(self, k) - for key in ('language', 'script', 'country'): + for key in ('language', 'script', 'territory'): write(key, get(key)) - write('{}code'.format(key), get('{}_code'.format(key))) + write(f'{key}code', get(f'{key}_code')) for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'): write(key, get(key)) - for key in ('languageEndonym', 'countryEndonym', + for key in ('languageEndonym', 'territoryEndonym', 'quotationStart', 'quotationEnd', 'alternateQuotationStart', 'alternateQuotationEnd', 'listPatternPartStart', 'listPatternPartMiddle', @@ -517,7 +571,7 @@ class Locale (object): '_'.join((k, cal)) for k in self.propsMonthDay('months') for cal in calendars): - write(key, escape(get(key)).encode('utf-8')) + write(key, escape(get(key))) write('groupSizes', ';'.join(str(x) for x in get('groupSizes'))) for key in ('currencyDigits', 'currencyRounding'): @@ -554,9 +608,9 @@ class Locale (object): (fullName, fullName), (firstThree, firstThree), (number, initial)), - 'islamic': ((u'Muharram', u'Safar', u'Rabiʻ I', u'Rabiʻ II', u'Jumada I', - u'Jumada II', u'Rajab', u'Shaʻban', u'Ramadan', u'Shawwal', - u'Dhuʻl-Qiʻdah', u'Dhuʻl-Hijjah'), + 'islamic': (('Muharram', 'Safar', 'Rabiʻ I', 'Rabiʻ II', 'Jumada I', + 'Jumada II', 'Rajab', 'Shaʻban', 'Ramadan', 'Shawwal', + 'Dhuʻl-Qiʻdah', 'Dhuʻl-Hijjah'), (fullName, fullName), (islamicShort, islamicShort), (number, number)), @@ -565,7 +619,7 @@ class Locale (object): (fullName, fullName), (fullName, fullName), (number, number)), - }, + }, sizes=('long', 'short', 'narrow')): for cal in calendars: try: @@ -590,7 +644,7 @@ class Locale (object): return cls(cls.__monthNames(calendars), language='C', language_code='0', languageEndonym='', script='AnyScript', script_code='0', - country='AnyTerritory', country_code='0', countryEndonym='', + territory='AnyTerritory', territory_code='0', territoryEndonym='', groupSizes=(3, 3, 1), decimal='.', group=',', list=';', percent='%', zero='0', 
minus='-', plus='+', exp='e', |