diff options
Diffstat (limited to 'util/locale_database/ldml.py')
-rw-r--r-- | util/locale_database/ldml.py | 215 |
1 files changed, 119 insertions, 96 deletions
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py index 110e5b7573..b94c242172 100644 --- a/util/locale_database/ldml.py +++ b/util/locale_database/ldml.py @@ -1,30 +1,5 @@ -############################################################################# -## -## Copyright (C) 2020 The Qt Company Ltd. -## Contact: https://www.qt.io/licensing/ -## -## This file is part of the test suite of the Qt Toolkit. -## -## $QT_BEGIN_LICENSE:GPL-EXCEPT$ -## Commercial License Usage -## Licensees holding valid commercial Qt licenses may use this file in -## accordance with the commercial license agreement provided with the -## Software or, alternatively, in accordance with the terms contained in -## a written agreement between you and The Qt Company. For licensing terms -## and conditions see https://www.qt.io/terms-conditions. For further -## information use the contact form at https://www.qt.io/contact-us. -## -## GNU General Public License Usage -## Alternatively, this file may be used under the terms of the GNU -## General Public License version 3 as published by the Free Software -## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -## included in the packaging of this file. Please review the following -## information to ensure the GNU General Public License requirements will -## be met: https://www.gnu.org/licenses/gpl-3.0.html. -## -## $QT_END_LICENSE$ -## -############################################################################# +# Copyright (C) 2020 The Qt Company Ltd. +# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 """Parsing the Locale Data Markup Language It's an XML format, so the raw parsing of XML is, of course, delegated @@ -46,6 +21,13 @@ See individual classes for further detail. from localetools import Error from dateconverter import convert_date +# The github version of CLDR uses '↑↑↑' to indicate "inherit" +INHERIT = '↑↑↑' + +def _attrsFromDom(dom): + return { k: (v if isinstance(v, str) else v.nodeValue) + for k, v in dom.attributes.items() } + class Node (object): """Wrapper for an arbitrary DOM node. @@ -75,6 +57,9 @@ class Node (object): else: self.draft = max(draft, self.draftScore(attr)) + def attributes(self): + return _attrsFromDom(self.dom) + def findAllChildren(self, tag, wanted = None, allDull = False): """All children that do have the given tag and attributes. @@ -124,7 +109,7 @@ class Node (object): one.""" seq = self.findAllChildren(tag) try: - node = seq.next() + node = next(seq) except StopIteration: raise Error('No child found where one was expected', tag) for it in seq: @@ -191,17 +176,35 @@ class XmlScanner (object): return elts class Supplement (XmlScanner): - def find(self, xpath): + def find(self, xpath, exclude=()): + """Finds nodes by matching a specified xpath. + + If exclude is passed, it should be a sequence of attribute names (its + default is empty). Any matches to the given xpath that also have any + attribute in this sequence will be excluded. + + For each childless node matching the xpath, or child of a node matching + the xpath, this yields a twople (name, attrs) where name is the + nodeName and attrs is a dict mapping the node's attribute's names to + their values. For attribute values that are not simple strings, the + nodeValue of the attribute node is used.""" elts = self.findNodes(xpath) - for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,) - for e in elts): + for elt in _iterateEach(e.dom.childNodes or (e.dom,) + for e in elts + if not any(a in e.dom.attributes + for a in exclude)): if elt.attributes: - yield (elt.nodeName, - dict((k, v if isinstance(v, basestring) else v.nodeValue) - for k, v in elt.attributes.items())) + yield elt.nodeName, _attrsFromDom(elt) class LocaleScanner (object): def __init__(self, name, nodes, root): + """Set up to scan data for a specified locale. + + First parameter is the name of the locale; it will be used in + error messages. Second is a tuple of DOM root-nodes of files + with locale data, later ones serving as fall-backs for data + missing in earlier ones. Third parameter is the root locale's + DOM node.""" self.name, self.nodes, self.base = name, nodes, root def find(self, xpath, default = None, draft = None): @@ -227,7 +230,7 @@ class LocaleScanner (object): def tagCodes(self): """Yields four tag codes - The tag codes are language, script, country and variant; an + The tag codes are language, script, territory and variant; an empty value for any of them indicates that no value was provided. The values are obtained from the primary file's top-level <identity> element. An Error is raised if any @@ -241,7 +244,7 @@ class LocaleScanner (object): except (KeyError, AttributeError): pass else: - raise Error('Alias to {}'.format(source)) + raise Error(f'Alias to {source}') ids = root.findUniqueChild('identity') for code in ('language', 'script', 'territory', 'variant'): @@ -259,12 +262,12 @@ class LocaleScanner (object): """Fetches currency data for this locale. Single argument, isoCode, is the ISO currency code for the - currency in use in the country. See also numericData, which + currency in use in the territory. See also numericData, which includes some currency formats. """ if isoCode: - stem = 'numbers/currencies/currency[{}]/'.format(isoCode) - symbol = self.find(stem + 'symbol', '') + stem = f'numbers/currencies/currency[{isoCode}]/' + symbol = self.find(f'{stem}symbol', '') name = self.__currencyDisplayName(stem) else: symbol = name = '' @@ -276,31 +279,38 @@ class LocaleScanner (object): First argument, lookup, is a callable that maps a numbering system's name to certain data about the system, as a mapping; - we expect this to have u'digits' as a key. + we expect this to have 'digits' as a key. """ system = self.find('numbers/defaultNumberingSystem') - stem = 'numbers/symbols[numberSystem={}]/'.format(system) - decimal = self.find(stem + 'decimal') - group = self.find(stem + 'group') - assert decimal != group, (self.name, system, decimal) + stem = f'numbers/symbols[numberSystem={system}]/' + decimal = self.find(f'{stem}decimal') + group = self.find(f'{stem}group') + if decimal == group: + # mn_Mong_MN @v43 :-( + clean = Node.draftScore('approved') + decimal = self.find(f'{stem}decimal', draft=clean) + group = self.find(f'{stem}group', draft=clean) + assert decimal != group, (self.name, system, decimal) + yield 'decimal', decimal yield 'group', group - yield 'percent', self.find(stem + 'percentSign') - yield 'list', self.find(stem + 'list') - yield 'exp', self.find(stem + 'exponential') + yield 'percent', self.find(f'{stem}percentSign') + yield 'list', self.find(f'{stem}list') + yield 'exp', self.find(f'{stem}exponential') yield 'groupSizes', self.__numberGrouping(system) digits = lookup(system)['digits'] assert len(digits) == 10 zero = digits[0] # Qt's number-formatting code assumes digits are consecutive - # (except Suzhou, CLDR's hanidec - see QTBUG-85409): + # (except Suzhou - see QTBUG-85409 - which shares its zero + # with CLDR's very-non-contiguous hanidec): assert all(ord(c) == i + (0x3020 if ord(zero) == 0x3007 else ord(zero)) for i, c in enumerate(digits[1:], 1)) yield 'zero', zero - plus = self.find(stem + 'plusSign') - minus = self.find(stem + 'minusSign') + plus = self.find(f'{stem}plusSign') + minus = self.find(f'{stem}minusSign') yield 'plus', plus yield 'minus', minus @@ -308,11 +318,11 @@ class LocaleScanner (object): xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[accounting]/pattern' try: money = self.find(xpath.replace('Formats/', - 'Formats[numberSystem={}]/'.format(system))) + f'Formats[numberSystem={system}]/')) except Error: money = self.find(xpath) money = self.__currencyFormats(money, plus, minus) - yield 'currencyFormat', money.next() + yield 'currencyFormat', next(money) neg = '' for it in money: assert not neg, 'There should be at most one more pattern' @@ -322,12 +332,12 @@ class LocaleScanner (object): def textPatternData(self): for key in ('quotationStart', 'alternateQuotationEnd', 'quotationEnd', 'alternateQuotationStart'): - yield key, self.find('delimiters/' + key) + yield key, self.find(f'delimiters/{key}') for key in ('start', 'middle', 'end'): - yield ('listPatternPart' + key.capitalize(), + yield (f'listPatternPart{key.capitalize()}', self.__fromLdmlListPattern(self.find( - 'listPatterns/listPattern/listPatternPart[{}]'.format(key)))) + f'listPatterns/listPattern/listPatternPart[{key}]'))) yield ('listPatternPartTwo', self.__fromLdmlListPattern(self.find( 'listPatterns/listPattern/listPatternPart[2]'))) @@ -335,28 +345,26 @@ class LocaleScanner (object): stem = 'dates/calendars/calendar[gregorian]/' # TODO: is wide really the right width to use here ? # abbreviated might be an option ... or try both ? - meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/' + meridiem = f'{stem}dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/' for key in ('am', 'pm'): - yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key), + yield key, self.find(f'{meridiem}dayPeriod[{key}]', draft = Node.draftScore('contributed')) for pair in (('long', 'full'), ('short', 'short')): for key in ('time', 'date'): - yield (pair[0] + key.capitalize() + 'Format', + yield (f'{pair[0]}{key.capitalize()}Format', convert_date(self.find( - stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format( - key, key, pair[1], key)))) + f'{stem}{key}Formats/{key}FormatLength[{pair[1]}]/{key}Format/pattern'))) - def endonyms(self, language, script, country, variant): + def endonyms(self, language, script, territory, variant): # TODO: take variant into account ? - for seq in ((language, script, country), - (language, script), (language, country), (language,)): + for seq in ((language, script, territory), + (language, script), (language, territory), (language,)): if not all(seq): continue try: yield ('languageEndonym', - self.find('localeDisplayNames/languages/language[{}]' - .format('_'.join(seq)))) + self.find(f'localeDisplayNames/languages/language[{"_".join(seq)}]')) except Error: pass else: @@ -365,9 +373,8 @@ class LocaleScanner (object): # grumble(failed to find endonym for language) yield 'languageEndonym', '' - yield ('countryEndonym', - self.find('localeDisplayNames/territories/territory[{}]' - .format(country), '')) + yield ('territoryEndonym', + self.find(f'localeDisplayNames/territories/territory[{territory}]', '')) def unitData(self): yield ('byte_unit', @@ -386,20 +393,20 @@ class LocaleScanner (object): def calendarNames(self, calendars): namings = self.__nameForms for cal in calendars: - stem = 'dates/calendars/calendar[' + cal + ']/months/' + stem = f'dates/calendars/calendar[{cal}]/months/' for key, mode, size in namings: - prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/' - yield (key + 'Months_' + cal, - ';'.join(self.find(stem + prop + 'month[{}]'.format(i)) + prop = f'monthContext[{mode}]/monthWidth[{size}]/' + yield (f'{key}Months_{cal}', + ';'.join(self.find(f'{stem}{prop}month[{i}]') for i in range(1, 13))) # Day data (for Gregorian, at least): stem = 'dates/calendars/calendar[gregorian]/days/' days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat') for (key, mode, size) in namings: - prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day' - yield (key + 'Days', - ';'.join(self.find(stem + prop + '[' + day + ']') + prop = f'dayContext[{mode}]/dayWidth[{size}]/day' + yield (f'{key}Days', + ';'.join(self.find(f'{stem}{prop}[{day}]') for day in days)) # Implementation details @@ -410,10 +417,10 @@ class LocaleScanner (object): ('long', 'format', 'wide'), ('short', 'format', 'abbreviated'), ('narrow', 'format', 'narrow'), - ) # Used for month and day names + ) # Used for month and day names def __find(self, xpath): - retries = [ xpath.split('/') ] + retries, foundNone = [ xpath.split('/') ], True while retries: tags, elts, roots = retries.pop(), self.nodes, (self.base.root,) for selector in tags: @@ -423,6 +430,9 @@ class LocaleScanner (object): break else: # Found matching elements + elts = tuple(self.__skipInheritors(elts)) + if elts: + foundNone = False # Possibly filter elts to prefer the least drafty ? for elt in elts: yield elt @@ -442,29 +452,42 @@ class LocaleScanner (object): if not roots: if retries: # Let outer loop fall back on an alias path: break - sought = '/'.join(tags) - if sought != xpath: - sought += ' (for {})'.format(xpath) - raise Error('All lack child {} for {} in {}'.format( - selector, sought, self.name)) + if foundNone: + sought = '/'.join(tags) + if sought != xpath: + sought += f' (for {xpath})' + raise Error(f'All lack child {selector} for {sought} in {self.name}') else: # Found matching elements + roots = tuple(self.__skipInheritors(roots)) + if roots: + foundNone = False for elt in roots: yield elt - sought = '/'.join(tags) - if sought != xpath: - sought += ' (for {})'.format(xpath) - raise Error('No {} in {}'.format(sought, self.name)) + if foundNone: + sought = '/'.join(tags) + if sought != xpath: + sought += f' (for {xpath})' + raise Error(f'No {sought} in {self.name}') + + @staticmethod + def __skipInheritors(elts): + for elt in elts: + try: + if elt.dom.firstChild.nodeValue != INHERIT: + yield elt + except (AttributeError, KeyError): + yield elt def __currencyDisplayName(self, stem): try: return self.find(stem + 'displayName') except Error: pass - for x in ('zero', 'one', 'two', 'few', 'many', 'other'): + for x in ('zero', 'one', 'two', 'few', 'many', 'other'): try: - return self.find(stem + 'displayName[count={}]'.format(x)) + return self.find(f'{stem}displayName[count={x}]') except Error: pass return '' @@ -474,10 +497,10 @@ class LocaleScanner (object): # (even for unitLength[narrow]) instead of kB (etc.), so # prefer any unitPattern provided, but prune its placeholder: for size in ('short', 'narrow'): # TODO: reverse order ? - stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify) + stem = f'units/unitLength[{size}{keySuffix}]/unit[digital-{quantify}byte]/' for count in ('many', 'few', 'two', 'other', 'zero', 'one'): try: - ans = self.find(stem + 'unitPattern[count={}]'.format(count)) + ans = self.find(f'{stem}unitPattern[count={count}]') except Error: continue @@ -490,7 +513,7 @@ class LocaleScanner (object): return ans try: - return self.find(stem + 'displayName') + return self.find(f'{stem}displayName') except Error: pass @@ -518,10 +541,10 @@ class LocaleScanner (object): if cache: byte = cache.pop() if all(byte == k for k in cache): - suffix = 'i' + byte + suffix = f'i{byte}' for q in siQuantifiers: # Those don't (yet, v36) exist in CLDR, so we always get the fall-back: - yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix) + yield self.__findUnit(keySuffix, q[:2], f'{q[0].upper()}{suffix}') else: # first call tail = suffix = suffix or 'B' for q in siQuantifiers: @@ -556,8 +579,8 @@ class LocaleScanner (object): elsewhere).""" top = int(self.find('numbers/minimumGroupingDigits')) assert top < 4, top # We store it in a 2-bit field - grouping = self.find('numbers/decimalFormats[numberSystem=' - + system + ']/decimalFormatLength/decimalFormat/pattern') + grouping = self.find(f'numbers/decimalFormats[numberSystem={system}]/' + 'decimalFormatLength/decimalFormat/pattern') groups = grouping.split('.')[0].split(',')[-3:] assert all(len(x) < 8 for x in groups[-2:]), grouping # we store them in 3-bit fields if len(groups) > 2: @@ -580,7 +603,7 @@ class LocaleScanner (object): # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns # there can be doubled or trippled currency sign, however none of the # locales use that. - p = p.replace(u'\xa4', "%2") + p = p.replace('\xa4', "%2") # Single quote goes away, but double goes to single: p = p.replace("''", '###').replace("'", '').replace('###', "'") # Use number system's signs: |