summary refs log tree commit diff stats
path: root/util/locale_database/ldml.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/ldml.py')
-rw-r--r-- util/locale_database/ldml.py 215
1 files changed, 119 insertions, 96 deletions
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index 110e5b7573..b94c242172 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -1,30 +1,5 @@
-#############################################################################
-##
-## Copyright (C) 2020 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
+# Copyright (C) 2020 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
"""Parsing the Locale Data Markup Language
It's an XML format, so the raw parsing of XML is, of course, delegated
@@ -46,6 +21,13 @@ See individual classes for further detail.
from localetools import Error
from dateconverter import convert_date
+# The github version of CLDR uses '↑↑↑' to indicate "inherit"
+INHERIT = '↑↑↑'
+
+def _attrsFromDom(dom):
+ return { k: (v if isinstance(v, str) else v.nodeValue)
+ for k, v in dom.attributes.items() }
+
class Node (object):
"""Wrapper for an arbitrary DOM node.
@@ -75,6 +57,9 @@ class Node (object):
else:
self.draft = max(draft, self.draftScore(attr))
+ def attributes(self):
+ return _attrsFromDom(self.dom)
+
def findAllChildren(self, tag, wanted = None, allDull = False):
"""All children that do have the given tag and attributes.
@@ -124,7 +109,7 @@ class Node (object):
one."""
seq = self.findAllChildren(tag)
try:
- node = seq.next()
+ node = next(seq)
except StopIteration:
raise Error('No child found where one was expected', tag)
for it in seq:
@@ -191,17 +176,35 @@ class XmlScanner (object):
return elts
class Supplement (XmlScanner):
- def find(self, xpath):
+ def find(self, xpath, exclude=()):
+ """Finds nodes by matching a specified xpath.
+
+ If exclude is passed, it should be a sequence of attribute names (its
+ default is empty). Any matches to the given xpath that also have any
+ attribute in this sequence will be excluded.
+
+ For each childless node matching the xpath, or child of a node matching
+ the xpath, this yields a pair (name, attrs) where name is the
+ nodeName and attrs is a dict mapping the node's attribute's names to
+ their values. For attribute values that are not simple strings, the
+ nodeValue of the attribute node is used."""
elts = self.findNodes(xpath)
- for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
- for e in elts):
+ for elt in _iterateEach(e.dom.childNodes or (e.dom,)
+ for e in elts
+ if not any(a in e.dom.attributes
+ for a in exclude)):
if elt.attributes:
- yield (elt.nodeName,
- dict((k, v if isinstance(v, basestring) else v.nodeValue)
- for k, v in elt.attributes.items()))
+ yield elt.nodeName, _attrsFromDom(elt)
class LocaleScanner (object):
def __init__(self, name, nodes, root):
+ """Set up to scan data for a specified locale.
+
+ First parameter is the name of the locale; it will be used in
+ error messages. Second is a tuple of DOM root-nodes of files
+ with locale data, later ones serving as fall-backs for data
+ missing in earlier ones. Third parameter is the root locale's
+ DOM node."""
self.name, self.nodes, self.base = name, nodes, root
def find(self, xpath, default = None, draft = None):
@@ -227,7 +230,7 @@ class LocaleScanner (object):
def tagCodes(self):
"""Yields four tag codes
- The tag codes are language, script, country and variant; an
+ The tag codes are language, script, territory and variant; an
empty value for any of them indicates that no value was
provided. The values are obtained from the primary file's
top-level <identity> element. An Error is raised if any
@@ -241,7 +244,7 @@ class LocaleScanner (object):
except (KeyError, AttributeError):
pass
else:
- raise Error('Alias to {}'.format(source))
+ raise Error(f'Alias to {source}')
ids = root.findUniqueChild('identity')
for code in ('language', 'script', 'territory', 'variant'):
@@ -259,12 +262,12 @@ class LocaleScanner (object):
"""Fetches currency data for this locale.
Single argument, isoCode, is the ISO currency code for the
- currency in use in the country. See also numericData, which
+ currency in use in the territory. See also numericData, which
includes some currency formats.
"""
if isoCode:
- stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
- symbol = self.find(stem + 'symbol', '')
+ stem = f'numbers/currencies/currency[{isoCode}]/'
+ symbol = self.find(f'{stem}symbol', '')
name = self.__currencyDisplayName(stem)
else:
symbol = name = ''
@@ -276,31 +279,38 @@ class LocaleScanner (object):
First argument, lookup, is a callable that maps a numbering
system's name to certain data about the system, as a mapping;
- we expect this to have u'digits' as a key.
+ we expect this to have 'digits' as a key.
"""
system = self.find('numbers/defaultNumberingSystem')
- stem = 'numbers/symbols[numberSystem={}]/'.format(system)
- decimal = self.find(stem + 'decimal')
- group = self.find(stem + 'group')
- assert decimal != group, (self.name, system, decimal)
+ stem = f'numbers/symbols[numberSystem={system}]/'
+ decimal = self.find(f'{stem}decimal')
+ group = self.find(f'{stem}group')
+ if decimal == group:
+ # mn_Mong_MN @v43 hits this :-( so retry with the 'approved' draft score:
+ clean = Node.draftScore('approved')
+ decimal = self.find(f'{stem}decimal', draft=clean)
+ group = self.find(f'{stem}group', draft=clean)
+ assert decimal != group, (self.name, system, decimal)
+
yield 'decimal', decimal
yield 'group', group
- yield 'percent', self.find(stem + 'percentSign')
- yield 'list', self.find(stem + 'list')
- yield 'exp', self.find(stem + 'exponential')
+ yield 'percent', self.find(f'{stem}percentSign')
+ yield 'list', self.find(f'{stem}list')
+ yield 'exp', self.find(f'{stem}exponential')
yield 'groupSizes', self.__numberGrouping(system)
digits = lookup(system)['digits']
assert len(digits) == 10
zero = digits[0]
# Qt's number-formatting code assumes digits are consecutive
- # (except Suzhou, CLDR's hanidec - see QTBUG-85409):
+ # (except Suzhou - see QTBUG-85409 - which shares its zero
+ # with CLDR's very-non-contiguous hanidec):
assert all(ord(c) == i + (0x3020 if ord(zero) == 0x3007 else ord(zero))
for i, c in enumerate(digits[1:], 1))
yield 'zero', zero
- plus = self.find(stem + 'plusSign')
- minus = self.find(stem + 'minusSign')
+ plus = self.find(f'{stem}plusSign')
+ minus = self.find(f'{stem}minusSign')
yield 'plus', plus
yield 'minus', minus
@@ -308,11 +318,11 @@ class LocaleScanner (object):
xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[accounting]/pattern'
try:
money = self.find(xpath.replace('Formats/',
- 'Formats[numberSystem={}]/'.format(system)))
+ f'Formats[numberSystem={system}]/'))
except Error:
money = self.find(xpath)
money = self.__currencyFormats(money, plus, minus)
- yield 'currencyFormat', money.next()
+ yield 'currencyFormat', next(money)
neg = ''
for it in money:
assert not neg, 'There should be at most one more pattern'
@@ -322,12 +332,12 @@ class LocaleScanner (object):
def textPatternData(self):
for key in ('quotationStart', 'alternateQuotationEnd',
'quotationEnd', 'alternateQuotationStart'):
- yield key, self.find('delimiters/' + key)
+ yield key, self.find(f'delimiters/{key}')
for key in ('start', 'middle', 'end'):
- yield ('listPatternPart' + key.capitalize(),
+ yield (f'listPatternPart{key.capitalize()}',
self.__fromLdmlListPattern(self.find(
- 'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+ f'listPatterns/listPattern/listPatternPart[{key}]')))
yield ('listPatternPartTwo',
self.__fromLdmlListPattern(self.find(
'listPatterns/listPattern/listPatternPart[2]')))
@@ -335,28 +345,26 @@ class LocaleScanner (object):
stem = 'dates/calendars/calendar[gregorian]/'
# TODO: is wide really the right width to use here ?
# abbreviated might be an option ... or try both ?
- meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+ meridiem = f'{stem}dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
for key in ('am', 'pm'):
- yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+ yield key, self.find(f'{meridiem}dayPeriod[{key}]',
draft = Node.draftScore('contributed'))
for pair in (('long', 'full'), ('short', 'short')):
for key in ('time', 'date'):
- yield (pair[0] + key.capitalize() + 'Format',
+ yield (f'{pair[0]}{key.capitalize()}Format',
convert_date(self.find(
- stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
- key, key, pair[1], key))))
+ f'{stem}{key}Formats/{key}FormatLength[{pair[1]}]/{key}Format/pattern')))
- def endonyms(self, language, script, country, variant):
+ def endonyms(self, language, script, territory, variant):
# TODO: take variant into account ?
- for seq in ((language, script, country),
- (language, script), (language, country), (language,)):
+ for seq in ((language, script, territory),
+ (language, script), (language, territory), (language,)):
if not all(seq):
continue
try:
yield ('languageEndonym',
- self.find('localeDisplayNames/languages/language[{}]'
- .format('_'.join(seq))))
+ self.find(f'localeDisplayNames/languages/language[{"_".join(seq)}]'))
except Error:
pass
else:
@@ -365,9 +373,8 @@ class LocaleScanner (object):
# grumble(failed to find endonym for language)
yield 'languageEndonym', ''
- yield ('countryEndonym',
- self.find('localeDisplayNames/territories/territory[{}]'
- .format(country), ''))
+ yield ('territoryEndonym',
+ self.find(f'localeDisplayNames/territories/territory[{territory}]', ''))
def unitData(self):
yield ('byte_unit',
@@ -386,20 +393,20 @@ class LocaleScanner (object):
def calendarNames(self, calendars):
namings = self.__nameForms
for cal in calendars:
- stem = 'dates/calendars/calendar[' + cal + ']/months/'
+ stem = f'dates/calendars/calendar[{cal}]/months/'
for key, mode, size in namings:
- prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
- yield (key + 'Months_' + cal,
- ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+ prop = f'monthContext[{mode}]/monthWidth[{size}]/'
+ yield (f'{key}Months_{cal}',
+ ';'.join(self.find(f'{stem}{prop}month[{i}]')
for i in range(1, 13)))
# Day data (for Gregorian, at least):
stem = 'dates/calendars/calendar[gregorian]/days/'
days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
for (key, mode, size) in namings:
- prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
- yield (key + 'Days',
- ';'.join(self.find(stem + prop + '[' + day + ']')
+ prop = f'dayContext[{mode}]/dayWidth[{size}]/day'
+ yield (f'{key}Days',
+ ';'.join(self.find(f'{stem}{prop}[{day}]')
for day in days))
# Implementation details
@@ -410,10 +417,10 @@ class LocaleScanner (object):
('long', 'format', 'wide'),
('short', 'format', 'abbreviated'),
('narrow', 'format', 'narrow'),
- ) # Used for month and day names
+ ) # Used for month and day names
def __find(self, xpath):
- retries = [ xpath.split('/') ]
+ retries, foundNone = [ xpath.split('/') ], True
while retries:
tags, elts, roots = retries.pop(), self.nodes, (self.base.root,)
for selector in tags:
@@ -423,6 +430,9 @@ class LocaleScanner (object):
break
else: # Found matching elements
+ elts = tuple(self.__skipInheritors(elts))
+ if elts:
+ foundNone = False
# Possibly filter elts to prefer the least drafty ?
for elt in elts:
yield elt
@@ -442,29 +452,42 @@ class LocaleScanner (object):
if not roots:
if retries: # Let outer loop fall back on an alias path:
break
- sought = '/'.join(tags)
- if sought != xpath:
- sought += ' (for {})'.format(xpath)
- raise Error('All lack child {} for {} in {}'.format(
- selector, sought, self.name))
+ if foundNone:
+ sought = '/'.join(tags)
+ if sought != xpath:
+ sought += f' (for {xpath})'
+ raise Error(f'All lack child {selector} for {sought} in {self.name}')
else: # Found matching elements
+ roots = tuple(self.__skipInheritors(roots))
+ if roots:
+ foundNone = False
for elt in roots:
yield elt
- sought = '/'.join(tags)
- if sought != xpath:
- sought += ' (for {})'.format(xpath)
- raise Error('No {} in {}'.format(sought, self.name))
+ if foundNone:
+ sought = '/'.join(tags)
+ if sought != xpath:
+ sought += f' (for {xpath})'
+ raise Error(f'No {sought} in {self.name}')
+
+ @staticmethod
+ def __skipInheritors(elts):
+ for elt in elts:
+ try:
+ if elt.dom.firstChild.nodeValue != INHERIT:
+ yield elt
+ except (AttributeError, KeyError):
+ yield elt
def __currencyDisplayName(self, stem):
try:
return self.find(stem + 'displayName')
except Error:
pass
- for x in ('zero', 'one', 'two', 'few', 'many', 'other'):
+ for x in ('zero', 'one', 'two', 'few', 'many', 'other'):
try:
- return self.find(stem + 'displayName[count={}]'.format(x))
+ return self.find(f'{stem}displayName[count={x}]')
except Error:
pass
return ''
@@ -474,10 +497,10 @@ class LocaleScanner (object):
# (even for unitLength[narrow]) instead of kB (etc.), so
# prefer any unitPattern provided, but prune its placeholder:
for size in ('short', 'narrow'): # TODO: reverse order ?
- stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+ stem = f'units/unitLength[{size}{keySuffix}]/unit[digital-{quantify}byte]/'
for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
try:
- ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+ ans = self.find(f'{stem}unitPattern[count={count}]')
except Error:
continue
@@ -490,7 +513,7 @@ class LocaleScanner (object):
return ans
try:
- return self.find(stem + 'displayName')
+ return self.find(f'{stem}displayName')
except Error:
pass
@@ -518,10 +541,10 @@ class LocaleScanner (object):
if cache:
byte = cache.pop()
if all(byte == k for k in cache):
- suffix = 'i' + byte
+ suffix = f'i{byte}'
for q in siQuantifiers:
# Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
- yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+ yield self.__findUnit(keySuffix, q[:2], f'{q[0].upper()}{suffix}')
else: # first call
tail = suffix = suffix or 'B'
for q in siQuantifiers:
@@ -556,8 +579,8 @@ class LocaleScanner (object):
elsewhere)."""
top = int(self.find('numbers/minimumGroupingDigits'))
assert top < 4, top # We store it in a 2-bit field
- grouping = self.find('numbers/decimalFormats[numberSystem='
- + system + ']/decimalFormatLength/decimalFormat/pattern')
+ grouping = self.find(f'numbers/decimalFormats[numberSystem={system}]/'
+ 'decimalFormatLength/decimalFormat/pattern')
groups = grouping.split('.')[0].split(',')[-3:]
assert all(len(x) < 8 for x in groups[-2:]), grouping # we store them in 3-bit fields
if len(groups) > 2:
@@ -580,7 +603,7 @@ class LocaleScanner (object):
# According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
# there can be doubled or tripled currency sign, however none of the
# locales use that.
- p = p.replace(u'\xa4', "%2")
+ p = p.replace('\xa4', "%2")
# Single quote goes away, but double goes to single:
p = p.replace("''", '###').replace("'", '').replace('###', "'")
# Use number system's signs: