diff options
Diffstat (limited to 'util/locale_database/qlocalexml.py')
-rw-r--r-- | util/locale_database/qlocalexml.py | 306 |
1 files changed, 152 insertions, 154 deletions
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py index e5aadba995..5cb56c2165 100644 --- a/util/locale_database/qlocalexml.py +++ b/util/locale_database/qlocalexml.py @@ -1,31 +1,5 @@ -# coding=utf8 -############################################################################# -## -## Copyright (C) 2020 The Qt Company Ltd. -## Contact: https://www.qt.io/licensing/ -## -## This file is part of the test suite of the Qt Toolkit. -## -## $QT_BEGIN_LICENSE:GPL-EXCEPT$ -## Commercial License Usage -## Licensees holding valid commercial Qt licenses may use this file in -## accordance with the commercial license agreement provided with the -## Software or, alternatively, in accordance with the terms contained in -## a written agreement between you and The Qt Company. For licensing terms -## and conditions see https://www.qt.io/terms-conditions. For further -## information use the contact form at https://www.qt.io/contact-us. -## -## GNU General Public License Usage -## Alternatively, this file may be used under the terms of the GNU -## General Public License version 3 as published by the Free Software -## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -## included in the packaging of this file. Please review the following -## information to ensure the GNU General Public License requirements will -## be met: https://www.gnu.org/licenses/gpl-3.0.html. -## -## $QT_END_LICENSE$ -## -############################################################################# +# Copyright (C) 2021 The Qt Company Ltd. +# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 """Shared serialization-scanning code for QLocaleXML format. Provides classes: @@ -35,15 +9,23 @@ Provides classes: Support: Spacer -- provides control over indentation of the output. + +RelaxNG schema for the used file format can be found in qlocalexml.rnc. 
+QLocaleXML files can be validated using: + + jing -c qlocalexml.rnc <file.xml> + +You can download jing from https://relaxng.org/jclark/jing.html if your +package manager lacks the jing package. """ -from __future__ import print_function + from xml.sax.saxutils import escape from localetools import Error # Tools used by Locale: def camel(seq): - yield seq.next() + yield next(seq) for word in seq: yield word.capitalize() @@ -51,88 +33,39 @@ def camelCase(words): return ''.join(camel(iter(words))) def addEscapes(s): - return ''.join(c if n < 128 else '\\x{:02x}'.format(n) + return ''.join(c if n < 128 else f'\\x{n:02x}' for n, c in ((ord(c), c) for c in s)) def startCount(c, text): # strspn """First index in text where it doesn't have a character in c""" assert text and text[0] in c try: - return (j for j, d in enumerate(text) if d not in c).next() + return next((j for j, d in enumerate(text) if d not in c)) except StopIteration: return len(text) -def convertFormat(format): - """Convert date/time format-specier from CLDR to Qt - - Match up (as best we can) the differences between: - * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table - * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString() - """ - # Compare and contrast dateconverter.py's convert_date(). - # Need to (check consistency and) reduce redundancy ! 
- result = "" - i = 0 - while i < len(format): - if format[i] == "'": - result += "'" - i += 1 - while i < len(format) and format[i] != "'": - result += format[i] - i += 1 - if i < len(format): - result += "'" - i += 1 - else: - s = format[i:] - if s.startswith('E'): # week-day - n = startCount('E', s) - if n < 3: - result += 'ddd' - elif n == 4: - result += 'dddd' - else: # 5: narrow, 6 short; but should be name, not number :-( - result += 'd' if n < 6 else 'dd' - i += n - elif s[0] in 'ab': # am/pm - # 'b' should distinguish noon/midnight, too :-( - result += "AP" - i += startCount('ab', s) - elif s.startswith('S'): # fractions of seconds: count('S') == number of decimals to show - result += 'z' - i += startCount('S', s) - elif s.startswith('V'): # long time zone specifiers (and a deprecated short ID) - result += 't' - i += startCount('V', s) - elif s[0] in 'zv': # zone - # Should use full name, e.g. "Central European Time", if 'zzzz' :-( - # 'v' should get generic non-location format, e.g. 
PT for "Pacific Time", no DST indicator - result += "t" - i += startCount('zv', s) - else: - result += format[i] - i += 1 - - return result - class QLocaleXmlReader (object): def __init__(self, filename): self.root = self.__parse(filename) - # Lists of (id, name, code) triples: - languages = tuple(self.__loadMap('language')) - scripts = tuple(self.__loadMap('script')) - countries = tuple(self.__loadMap('country')) + + from enumdata import language_map, script_map, territory_map + # Lists of (id, enum name, code, en.xml name) tuples: + languages = tuple(self.__loadMap('language', language_map)) + scripts = tuple(self.__loadMap('script', script_map)) + territories = tuple(self.__loadMap('territory', territory_map)) self.__likely = tuple(self.__likelySubtagsMap()) - # Mappings {ID: (name, code)} - self.languages = dict((v[0], v[1:]) for v in languages) - self.scripts = dict((v[0], v[1:]) for v in scripts) - self.countries = dict((v[0], v[1:]) for v in countries) - # Private mappings {name: (ID, code)} - self.__langByName = dict((v[1], (v[0], v[2])) for v in languages) - self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts) - self.__landByName = dict((v[1], (v[0], v[2])) for v in countries) + + # Mappings {ID: (enum name, code, en.xml name)} + self.languages = {v[0]: v[1:] for v in languages} + self.scripts = {v[0]: v[1:] for v in scripts} + self.territories = {v[0]: v[1:] for v in territories} + + # Private mappings {enum name: (ID, code)} + self.__langByName = {v[1]: (v[0], v[2]) for v in languages} + self.__textByName = {v[1]: (v[0], v[2]) for v in scripts} + self.__landByName = {v[1]: (v[0], v[2]) for v in territories} # Other properties: - self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries) + self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories) self.cldrVersion = self.__firstChildText(self.root, "version") def loadLocaleMap(self, calendars, grumble = lambda text: None): @@ -142,31 +75,31 @@ class 
QLocaleXmlReader (object): locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars) language = self.__langByName[locale.language][0] script = self.__textByName[locale.script][0] - country = self.__landByName[locale.country][0] + territory = self.__landByName[locale.territory][0] if language != 1: # C - if country == 0: - grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language)) + if territory == 0: + grumble(f'loadLocaleMap: No territory id for "{locale.language}"\n') if script == 0: - # Find default script for the given language and country - see: + # Find default script for the given language and territory - see: # http://www.unicode.org/reports/tr35/#Likely_Subtags try: try: - to = likely[(locale.language, 'AnyScript', locale.country)] + to = likely[(locale.language, 'AnyScript', locale.territory)] except KeyError: - to = likely[(locale.language, 'AnyScript', 'AnyCountry')] + to = likely[(locale.language, 'AnyScript', 'AnyTerritory')] except KeyError: pass else: locale.script = to[1] script = self.__textByName[locale.script][0] - yield (language, script, country), locale + yield (language, script, territory), locale def languageIndices(self, locales): index = 0 - for key, value in self.languages.iteritems(): + for key, value in self.languages.items(): i, count = 0, locales.count(key) if count > 0: i = index @@ -190,26 +123,53 @@ class QLocaleXmlReader (object): '_'.join(tag(give)), ids(give)) def defaultMap(self): - """Map language and script to their default country by ID. + """Map language and script to their default territory by ID. 
- Yields ((language, script), country) wherever the likely + Yields ((language, script), territory) wherever the likely sub-tags mapping says language's default locale uses the given - script and country.""" + script and territory.""" for have, give in self.__likely: - if have[1:] == ('AnyScript', 'AnyCountry') and give[2] != 'AnyCountry': + if have[1:] == ('AnyScript', 'AnyTerritory') and give[2] != 'AnyTerritory': assert have[0] == give[0], (have, give) yield ((self.__langByName[give[0]][0], self.__textByName[give[1]][0]), self.__landByName[give[2]][0]) + def enumify(self, name, suffix): + """Stick together the parts of an enumdata.py name. + + Names given in enumdata.py include spaces and hyphens that we + can't include in an identifier, such as the name of a member + of an enum type. Removing those would lose the word + boundaries, so make sure each word starts with a capital (but + don't simply capitalize() as some names contain words, + e.g. McDonald, that have later capitals in them). 
+ + We also need to resolve duplication between languages and + territories (by adding a suffix to each) and add Script to the + ends of script-names that don't already end in it.""" + name = name.replace('-', ' ') + # Don't .capitalize() as McDonald is already camel-case (see enumdata.py): + name = ''.join(word[0].upper() + word[1:] for word in name.split()) + if suffix != 'Script': + assert not(name in self.__dupes and name.endswith(suffix)) + return name + suffix if name in self.__dupes else name + + if not name.endswith(suffix): + name += suffix + if name in self.__dupes: + raise Error(f'The script name "{name}" is messy') + return name + # Implementation details: - def __loadMap(self, category): + def __loadMap(self, category, enum): kid = self.__firstChildText - for element in self.__eachEltInGroup(self.root, category + 'List', category): - yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code') + for element in self.__eachEltInGroup(self.root, f'{category}List', category): + key = int(kid(element, 'id')) + yield key, enum[key][0], kid(element, 'code'), kid(element, 'name') def __likelySubtagsMap(self): - def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText): + def triplet(element, keys=('language', 'script', 'territory'), kid = self.__firstChildText): return tuple(kid(element, key) for key in keys) kid = self.__firstChildElt @@ -246,7 +206,7 @@ class QLocaleXmlReader (object): return child child = child.nextSibling - raise Error('No {} child found'.format(name)) + raise Error(f'No {name} child found') @classmethod def __firstChildText(cls, elt, key): @@ -302,7 +262,7 @@ class Spacer (object): elif line.startswith('<') and not line.startswith('<!'): cut = line.find('>') tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0] - if '</{}>'.format(tag) not in line: + if f'</{tag}>' not in line: self.current += self.__each return indent + line + '\n' @@ -333,10 +293,28 @@ class QLocaleXmlWriter 
(object): self.__write('<localeDatabase>') # Output of various sections, in their usual order: - def enumData(self, languages, scripts, countries): - self.__enumTable('language', languages) - self.__enumTable('script', scripts) - self.__enumTable('country', countries) + def enumData(self, code2name): + """Output name/id/code tables for language, script and territory. + + Parameter, code2name, is a function taking 'language', + 'script' or 'territory' and returning a lookup function that + maps codes, of the relevant type, to their English names. This + lookup function is passed a code and the name, both taken from + enumdata.py, that QLocale uses, so the .get() of a dict will + work. The English name from this lookup will be used by + QLocale::*ToString() for the enum member whose name is based + on the enumdata.py name passed as fallback to the lookup.""" + from enumdata import language_map, script_map, territory_map + self.__enumTable('language', language_map, code2name) + self.__enumTable('script', script_map, code2name) + self.__enumTable('territory', territory_map, code2name) + # Prepare to detect any unused codes (see __writeLocale(), close()): + self.__languages = set(p[1] for p in language_map.values() + if not p[1].isspace()) + self.__scripts = set(p[1] for p in script_map.values() + if p[1] != 'Zzzz') + self.__territories = set(p[1] for p in territory_map.values() + if p[1] != 'ZZ') def likelySubTags(self, entries): self.__openTag('likelySubtags') @@ -350,13 +328,11 @@ class QLocaleXmlWriter (object): def locales(self, locales, calendars): self.__openTag('localeList') self.__openTag('locale') - Locale.C(calendars).toXml(self.inTag, calendars) + self.__writeLocale(Locale.C(calendars), calendars) self.__closeTag('locale') - keys = locales.keys() - keys.sort() - for key in keys: + for key in sorted(locales.keys()): self.__openTag('locale') - locales[key].toXml(self.inTag, calendars) + self.__writeLocale(locales[key], calendars) self.__closeTag('locale') 
self.__closeTag('localeList') @@ -364,13 +340,27 @@ class QLocaleXmlWriter (object): self.inTag('version', cldrVersion) def inTag(self, tag, text): - self.__write('<{0}>{1}</{0}>'.format(tag, text)) + self.__write(f'<{tag}>{text}</{tag}>') - def close(self): + def close(self, grumble): + """Finish writing and grumble about any issues discovered.""" if self.__rawOutput != self.__complain: self.__write('</localeDatabase>') self.__rawOutput = self.__complain + if self.__languages or self.__scripts or self.__territories: + grumble('Some enum members are unused, corresponding to these tags:\n') + import textwrap + def kvetch(kind, seq, g = grumble, w = textwrap.wrap): + g('\n\t'.join(w(f' {kind}: {", ".join(sorted(seq))}', width=80)) + '\n') + if self.__languages: + kvetch('Languages', self.__languages) + if self.__scripts: + kvetch('Scripts', self.__scripts) + if self.__territories: + kvetch('Territories', self.__territories) + grumble('It may make sense to deprecate them.\n') + # Implementation details @staticmethod def __printit(text): @@ -379,28 +369,39 @@ class QLocaleXmlWriter (object): def __complain(text): raise Error('Attempted to write data after closing :-(') - def __enumTable(self, tag, table): - self.__openTag(tag + 'List') - for key, value in table.iteritems(): + @staticmethod + def __xmlSafe(text): + return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') + + def __enumTable(self, tag, table, code2name): + self.__openTag(f'{tag}List') + enname, safe = code2name(tag), self.__xmlSafe + for key, (name, code) in table.items(): self.__openTag(tag) - self.inTag('name', value[0]) + self.inTag('name', safe(enname(code, name))) self.inTag('id', key) - self.inTag('code', value[1]) + self.inTag('code', code) self.__closeTag(tag) - self.__closeTag(tag + 'List') + self.__closeTag(f'{tag}List') def __likelySubTag(self, tag, likely): self.__openTag(tag) self.inTag('language', likely[0]) self.inTag('script', likely[1]) - self.inTag('country', likely[2]) + 
self.inTag('territory', likely[2]) # self.inTag('variant', likely[3]) self.__closeTag(tag) + def __writeLocale(self, locale, calendars): + locale.toXml(self.inTag, calendars) + self.__languages.discard(locale.language_code) + self.__scripts.discard(locale.script_code) + self.__territories.discard(locale.territory_code) + def __openTag(self, tag): - self.__write('<{}>'.format(tag)) + self.__write(f'<{tag}>') def __closeTag(self, tag): - self.__write('</{}>'.format(tag)) + self.__write(f'</{tag}>') def __write(self, line): self.__rawOutput(self.__wrap(line)) @@ -432,16 +433,16 @@ class Locale (object): __asint = ("currencyDigits", "currencyRounding") # Convert day-name to Qt day-of-week number: __asdow = ("firstDayOfWeek", "weekendStart", "weekendEnd") - # Convert from CLDR format-strings to QDateTimeParser ones: - __asfmt = ("longDateFormat", "shortDateFormat", "longTimeFormat", "shortTimeFormat") # Just use the raw text: - __astxt = ("language", "languageEndonym", "script", "country", "countryEndonym", + __astxt = ("language", "languageEndonym", "script", "territory", "territoryEndonym", "decimal", "group", "zero", "list", "percent", "minus", "plus", "exp", "quotationStart", "quotationEnd", "alternateQuotationStart", "alternateQuotationEnd", "listPatternPartStart", "listPatternPartMiddle", "listPatternPartEnd", "listPatternPartTwo", "am", "pm", + "longDateFormat", "shortDateFormat", + "longTimeFormat", "shortTimeFormat", 'byte_unit', 'byte_si_quantified', 'byte_iec_quantified', "currencyIsoCode", "currencySymbol", "currencyDisplayName", "currencyFormat", "currencyNegativeFormat") @@ -466,14 +467,11 @@ class Locale (object): for k in cls.__asdow: data[k] = cls.__qDoW[lookup(k)] - for k in cls.__asfmt: - data[k] = convertFormat(lookup(k)) - for k in cls.__astxt + tuple(cls.propsMonthDay('days')): data['listDelim' if k == 'list' else k] = lookup(k) for k in cls.propsMonthDay('months'): - data[k] = dict((cal, lookup('_'.join((k, cal)))) for cal in calendars) + data[k] 
= {cal: lookup('_'.join((k, cal))) for cal in calendars} grouping = lookup('groupSizes').split(';') data.update(groupLeast = int(grouping[0]), @@ -493,15 +491,15 @@ class Locale (object): form used by CLDR; its default is ('gregorian',). """ get = lambda k: getattr(self, k) - for key in ('language', 'script', 'country'): + for key in ('language', 'script', 'territory'): write(key, get(key)) - write('{}code'.format(key), get('{}_code'.format(key))) + write(f'{key}code', get(f'{key}_code')) for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'): write(key, get(key)) - for key in ('languageEndonym', 'countryEndonym', + for key in ('languageEndonym', 'territoryEndonym', 'quotationStart', 'quotationEnd', 'alternateQuotationStart', 'alternateQuotationEnd', 'listPatternPartStart', 'listPatternPartMiddle', @@ -517,7 +515,7 @@ class Locale (object): '_'.join((k, cal)) for k in self.propsMonthDay('months') for cal in calendars): - write(key, escape(get(key)).encode('utf-8')) + write(key, escape(get(key))) write('groupSizes', ';'.join(str(x) for x in get('groupSizes'))) for key in ('currencyDigits', 'currencyRounding'): @@ -554,9 +552,9 @@ class Locale (object): (fullName, fullName), (firstThree, firstThree), (number, initial)), - 'islamic': ((u'Muharram', u'Safar', u'Rabiʻ I', u'Rabiʻ II', u'Jumada I', - u'Jumada II', u'Rajab', u'Shaʻban', u'Ramadan', u'Shawwal', - u'Dhuʻl-Qiʻdah', u'Dhuʻl-Hijjah'), + 'islamic': (('Muharram', 'Safar', 'Rabiʻ I', 'Rabiʻ II', 'Jumada I', + 'Jumada II', 'Rajab', 'Shaʻban', 'Ramadan', 'Shawwal', + 'Dhuʻl-Qiʻdah', 'Dhuʻl-Hijjah'), (fullName, fullName), (islamicShort, islamicShort), (number, number)), @@ -565,7 +563,7 @@ class Locale (object): (fullName, fullName), (fullName, fullName), (number, number)), - }, + }, sizes=('long', 'short', 'narrow')): for cal in calendars: try: @@ -590,7 +588,7 @@ class Locale (object): return cls(cls.__monthNames(calendars), language='C', language_code='0', languageEndonym='', 
script='AnyScript', script_code='0', - country='AnyCountry', country_code='0', countryEndonym='', + territory='AnyTerritory', territory_code='0', territoryEndonym='', groupSizes=(3, 3, 1), decimal='.', group=',', list=';', percent='%', zero='0', minus='-', plus='+', exp='e', @@ -605,8 +603,8 @@ class Locale (object): byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers), am='AM', pm='PM', firstDayOfWeek='mon', weekendStart='sat', weekendEnd='sun', - longDateFormat='EEEE, d MMMM yyyy', shortDateFormat='d MMM yyyy', - longTimeFormat='HH:mm:ss z', shortTimeFormat='HH:mm:ss', + longDateFormat='dddd, d MMMM yyyy', shortDateFormat='d MMM yyyy', + longTimeFormat='HH:mm:ss t', shortTimeFormat='HH:mm:ss', longDays=';'.join(days), shortDays=';'.join(d[:3] for d in days), narrowDays='7;1;2;3;4;5;6', |