1 files changed, 119 insertions, 96 deletions
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index 110e5b7573..b94c242172 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -1,30 +1,5 @@
-#############################################################################
-##
-## Copyright (C) 2020 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
+# Copyright (C) 2020 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
 """Parsing the Locale Data Markup Language
 
 It's an XML format, so the raw parsing of XML is, of course, delegated
@@ -46,6 +21,13 @@ See individual classes for further detail.
 from localetools import Error
 from dateconverter import convert_date
 
+# The github version of CLDR uses '↑↑↑' to indicate "inherit"
+INHERIT = '↑↑↑'
+
+def _attrsFromDom(dom): # attribute name -> string value, for a DOM node
+    return { k: (v if isinstance(v, str) else v.nodeValue)
+             for k, v in dom.attributes.items() }
+
 class Node (object):
     """Wrapper for an arbitrary DOM node.
 
@@ -75,6 +57,9 @@ class Node (object):
         else:
             self.draft = max(draft, self.draftScore(attr))
 
+    def attributes(self): # this node's DOM attributes, as a {name: value} dict
+        return _attrsFromDom(self.dom)
+
     def findAllChildren(self, tag, wanted = None, allDull = False):
         """All children that do have the given tag and attributes.
 
@@ -124,7 +109,7 @@ class Node (object):
         one."""
         seq = self.findAllChildren(tag)
         try:
-            node = seq.next()
+            node = next(seq)
         except StopIteration:
             raise Error('No child found where one was expected', tag)
         for it in seq:
@@ -191,17 +176,35 @@ class XmlScanner (object):
         return elts
 
 class Supplement (XmlScanner):
-    def find(self, xpath):
+    def find(self, xpath, exclude=()):
+        """Finds nodes by matching a specified xpath.
+
+        If exclude is passed, it should be a sequence of attribute names (its
+        default is empty). Any match to the given xpath that has any attribute
+        named in this sequence is skipped, together with its children.
+
+        For each childless node matching the xpath, or child of a node matching
+        the xpath, that has attributes, this yields a pair (name, attrs) where
+        name is the nodeName and attrs is a dict mapping attribute names to
+        their values. For attribute values that are not simple strings, the
+        nodeValue of the attribute node is used."""
         elts = self.findNodes(xpath)
-        for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
-                                for e in elts):
+        for elt in _iterateEach(e.dom.childNodes or (e.dom,)
+                                for e in elts
+                                if not any(a in e.dom.attributes
+                                           for a in exclude)):
             if elt.attributes:
-                yield (elt.nodeName,
-                       dict((k, v if isinstance(v, basestring) else v.nodeValue)
-                            for k, v in elt.attributes.items()))
+                yield elt.nodeName, _attrsFromDom(elt)
 
 class LocaleScanner (object):
     def __init__(self, name, nodes, root):
+        """Set up to scan data for a specified locale.
+
+        First parameter is the name of the locale; it will be used in
+        error messages. Second is a tuple of DOM root-nodes of files
+        with locale data, later ones serving as fall-backs for data
+        missing in earlier ones. Third parameter is the root locale's
+        DOM node."""
         self.name, self.nodes, self.base = name, nodes, root
 
     def find(self, xpath, default = None, draft = None):
@@ -227,7 +230,7 @@ class LocaleScanner (object):
     def tagCodes(self):
         """Yields four tag codes
 
-        The tag codes are language, script, country and variant; an
+        The tag codes are language, script, territory and variant; an
         empty value for any of them indicates that no value was
         provided.  The values are obtained from the primary file's
         top-level <identity> element.  An Error is raised if any
@@ -241,7 +244,7 @@ class LocaleScanner (object):
             except (KeyError, AttributeError):
                 pass
             else:
-                raise Error('Alias to {}'.format(source))
+                raise Error(f'Alias to {source}')
 
         ids = root.findUniqueChild('identity')
         for code in ('language', 'script', 'territory', 'variant'):
@@ -259,12 +262,12 @@ class LocaleScanner (object):
         """Fetches currency data for this locale.
 
         Single argument, isoCode, is the ISO currency code for the
-        currency in use in the country. See also numericData, which
+        currency in use in the territory. See also numericData, which
         includes some currency formats.
         """
         if isoCode:
-            stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
-            symbol = self.find(stem + 'symbol', '')
+            stem = f'numbers/currencies/currency[{isoCode}]/'
+            symbol = self.find(f'{stem}symbol', '')
             name = self.__currencyDisplayName(stem)
         else:
             symbol = name = ''
@@ -276,31 +279,38 @@ class LocaleScanner (object):
 
         First argument, lookup, is a callable that maps a numbering
         system's name to certain data about the system, as a mapping;
-        we expect this to have u'digits' as a key.
+        we expect this to have 'digits' as a key.
         """
         system = self.find('numbers/defaultNumberingSystem')
-        stem = 'numbers/symbols[numberSystem={}]/'.format(system)
-        decimal = self.find(stem + 'decimal')
-        group = self.find(stem + 'group')
-        assert decimal != group, (self.name, system, decimal)
+        stem = f'numbers/symbols[numberSystem={system}]/'
+        decimal = self.find(f'{stem}decimal')
+        group = self.find(f'{stem}group')
+        if decimal == group:
+            # mn_Mong_MN @v43 :-(
+            clean = Node.draftScore('approved')
+            decimal = self.find(f'{stem}decimal', draft=clean)
+            group = self.find(f'{stem}group', draft=clean)
+            assert decimal != group, (self.name, system, decimal)
+
         yield 'decimal', decimal
         yield 'group', group
-        yield 'percent', self.find(stem + 'percentSign')
-        yield 'list', self.find(stem + 'list')
-        yield 'exp', self.find(stem + 'exponential')
+        yield 'percent', self.find(f'{stem}percentSign')
+        yield 'list', self.find(f'{stem}list')
+        yield 'exp', self.find(f'{stem}exponential')
         yield 'groupSizes', self.__numberGrouping(system)
 
         digits = lookup(system)['digits']
         assert len(digits) == 10
         zero = digits[0]
         # Qt's number-formatting code assumes digits are consecutive
-        # (except Suzhou, CLDR's hanidec - see QTBUG-85409):
+        # (except Suzhou - see QTBUG-85409 - which shares its zero
+        # with CLDR's very-non-contiguous hanidec):
         assert all(ord(c) == i + (0x3020 if ord(zero) == 0x3007 else ord(zero))
                    for i, c in enumerate(digits[1:], 1))
         yield 'zero', zero
 
-        plus = self.find(stem + 'plusSign')
-        minus = self.find(stem + 'minusSign')
+        plus = self.find(f'{stem}plusSign')
+        minus = self.find(f'{stem}minusSign')
         yield 'plus', plus
         yield 'minus', minus
 
@@ -308,11 +318,11 @@ class LocaleScanner (object):
         xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[accounting]/pattern'
         try:
             money = self.find(xpath.replace('Formats/',
-                                            'Formats[numberSystem={}]/'.format(system)))
+                                            f'Formats[numberSystem={system}]/'))
         except Error:
             money = self.find(xpath)
         money = self.__currencyFormats(money, plus, minus)
-        yield 'currencyFormat', money.next()
+        yield 'currencyFormat', next(money)
         neg = ''
         for it in money:
             assert not neg, 'There should be at most one more pattern'
@@ -322,12 +332,12 @@ class LocaleScanner (object):
     def textPatternData(self):
         for key in ('quotationStart', 'alternateQuotationEnd',
                     'quotationEnd', 'alternateQuotationStart'):
-            yield key, self.find('delimiters/' + key)
+            yield key, self.find(f'delimiters/{key}')
 
         for key in ('start', 'middle', 'end'):
-            yield ('listPatternPart' + key.capitalize(),
+            yield (f'listPatternPart{key.capitalize()}',
                    self.__fromLdmlListPattern(self.find(
-                        'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+                        f'listPatterns/listPattern/listPatternPart[{key}]')))
         yield ('listPatternPartTwo',
                self.__fromLdmlListPattern(self.find(
                     'listPatterns/listPattern/listPatternPart[2]')))
@@ -335,28 +345,26 @@ class LocaleScanner (object):
         stem = 'dates/calendars/calendar[gregorian]/'
         # TODO: is wide really the right width to use here ?
         # abbreviated might be an option ... or try both ?
-        meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+        meridiem = f'{stem}dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
         for key in ('am', 'pm'):
-            yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+            yield key, self.find(f'{meridiem}dayPeriod[{key}]',
                                  draft = Node.draftScore('contributed'))
 
         for pair in (('long', 'full'), ('short', 'short')):
             for key in ('time', 'date'):
-                yield (pair[0] + key.capitalize() + 'Format',
+                yield (f'{pair[0]}{key.capitalize()}Format',
                        convert_date(self.find(
-                            stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
-                                key, key, pair[1], key))))
+                            f'{stem}{key}Formats/{key}FormatLength[{pair[1]}]/{key}Format/pattern')))
 
-    def endonyms(self, language, script, country, variant):
+    def endonyms(self, language, script, territory, variant):
         # TODO: take variant into account ?
-        for seq in ((language, script, country),
-                    (language, script), (language, country), (language,)):
+        for seq in ((language, script, territory),
+                    (language, script), (language, territory), (language,)):
             if not all(seq):
                 continue
             try:
                 yield ('languageEndonym',
-                       self.find('localeDisplayNames/languages/language[{}]'
-                                 .format('_'.join(seq))))
+                       self.find(f'localeDisplayNames/languages/language[{"_".join(seq)}]'))
             except Error:
                 pass
             else:
@@ -365,9 +373,8 @@ class LocaleScanner (object):
             # grumble(failed to find endonym for language)
             yield 'languageEndonym', ''
 
-        yield ('countryEndonym',
-               self.find('localeDisplayNames/territories/territory[{}]'
-                         .format(country), ''))
+        yield ('territoryEndonym',
+               self.find(f'localeDisplayNames/territories/territory[{territory}]', ''))
 
     def unitData(self):
         yield ('byte_unit',
@@ -386,20 +393,20 @@ class LocaleScanner (object):
     def calendarNames(self, calendars):
         namings = self.__nameForms
         for cal in calendars:
-            stem = 'dates/calendars/calendar[' + cal + ']/months/'
+            stem = f'dates/calendars/calendar[{cal}]/months/'
             for key, mode, size in namings:
-                prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
-                yield (key + 'Months_' + cal,
-                       ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+                prop = f'monthContext[{mode}]/monthWidth[{size}]/'
+                yield (f'{key}Months_{cal}',
+                       ';'.join(self.find(f'{stem}{prop}month[{i}]')
                                 for i in range(1, 13)))
 
         # Day data (for Gregorian, at least):
         stem = 'dates/calendars/calendar[gregorian]/days/'
         days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
         for (key, mode, size) in namings:
-            prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
-            yield (key + 'Days',
-                   ';'.join(self.find(stem + prop + '[' + day + ']')
+            prop = f'dayContext[{mode}]/dayWidth[{size}]/day'
+            yield (f'{key}Days',
+                   ';'.join(self.find(f'{stem}{prop}[{day}]')
                             for day in days))
 
     # Implementation details
@@ -410,10 +417,10 @@ class LocaleScanner (object):
         ('long', 'format', 'wide'),
         ('short', 'format', 'abbreviated'),
         ('narrow', 'format', 'narrow'),
-        ) # Used for month and day names
+    ) # Used for month and day names
 
     def __find(self, xpath):
-        retries = [ xpath.split('/') ]
+        retries, foundNone = [ xpath.split('/') ], True
         while retries:
             tags, elts, roots = retries.pop(), self.nodes, (self.base.root,)
             for selector in tags:
@@ -423,6 +430,9 @@ class LocaleScanner (object):
                     break
 
             else: # Found matching elements
+                elts = tuple(self.__skipInheritors(elts))
+                if elts:
+                    foundNone = False
                 # Possibly filter elts to prefer the least drafty ?
                 for elt in elts:
                     yield elt
@@ -442,29 +452,42 @@ class LocaleScanner (object):
                 if not roots:
                     if retries: # Let outer loop fall back on an alias path:
                         break
-                    sought = '/'.join(tags)
-                    if sought != xpath:
-                        sought += ' (for {})'.format(xpath)
-                    raise Error('All lack child {} for {} in {}'.format(
-                            selector, sought, self.name))
+                    if foundNone:
+                        sought = '/'.join(tags)
+                        if sought != xpath:
+                            sought += f' (for {xpath})'
+                        raise Error(f'All lack child {selector} for {sought} in {self.name}')
 
             else: # Found matching elements
+                roots = tuple(self.__skipInheritors(roots))
+                if roots:
+                    foundNone = False
                 for elt in roots:
                     yield elt
 
-        sought = '/'.join(tags)
-        if sought != xpath:
-            sought += ' (for {})'.format(xpath)
-        raise Error('No {} in {}'.format(sought, self.name))
+        if foundNone:
+            sought = '/'.join(tags)
+            if sought != xpath:
+                sought += f' (for {xpath})'
+            raise Error(f'No {sought} in {self.name}')
+
+    @staticmethod
+    def __skipInheritors(elts): # Yields those of elts whose first child's value isn't INHERIT
+        for elt in elts:
+            try:
+                if elt.dom.firstChild.nodeValue != INHERIT:
+                    yield elt
+            except (AttributeError, KeyError): # nothing to compare (e.g. no child): keep elt
+                yield elt
 
     def __currencyDisplayName(self, stem):
         try:
             return self.find(stem + 'displayName')
         except Error:
             pass
-        for x in  ('zero', 'one', 'two', 'few', 'many', 'other'):
+        for x in ('zero', 'one', 'two', 'few', 'many', 'other'): # per-count fall-backs
             try:
-                return self.find(stem + 'displayName[count={}]'.format(x))
+                return self.find(f'{stem}displayName[count={x}]')
             except Error:
                 pass
         return ''
@@ -474,10 +497,10 @@ class LocaleScanner (object):
         # (even for unitLength[narrow]) instead of kB (etc.), so
         # prefer any unitPattern provided, but prune its placeholder:
         for size in ('short', 'narrow'): # TODO: reverse order ?
-            stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+            stem = f'units/unitLength[{size}{keySuffix}]/unit[digital-{quantify}byte]/'
             for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
                 try:
-                    ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+                    ans = self.find(f'{stem}unitPattern[count={count}]')
                 except Error:
                     continue
 
@@ -490,7 +513,7 @@ class LocaleScanner (object):
                     return ans
 
             try:
-                return self.find(stem + 'displayName')
+                return self.find(f'{stem}displayName')
             except Error:
                 pass
 
@@ -518,10 +541,10 @@ class LocaleScanner (object):
             if cache:
                 byte = cache.pop()
                 if all(byte == k for k in cache):
-                    suffix = 'i' + byte
+                    suffix = f'i{byte}'
             for q in siQuantifiers:
                 # Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
-                yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+                yield self.__findUnit(keySuffix, q[:2], f'{q[0].upper()}{suffix}')
         else: # first call
             tail = suffix = suffix or 'B'
             for q in siQuantifiers:
@@ -556,8 +579,8 @@ class LocaleScanner (object):
         elsewhere)."""
         top = int(self.find('numbers/minimumGroupingDigits'))
         assert top < 4, top # We store it in a 2-bit field
-        grouping = self.find('numbers/decimalFormats[numberSystem='
-                             + system + ']/decimalFormatLength/decimalFormat/pattern')
+        grouping = self.find(f'numbers/decimalFormats[numberSystem={system}]/'
+                             'decimalFormatLength/decimalFormat/pattern')
         groups = grouping.split('.')[0].split(',')[-3:]
         assert all(len(x) < 8 for x in groups[-2:]), grouping # we store them in 3-bit fields
         if len(groups) > 2:
@@ -580,7 +603,7 @@ class LocaleScanner (object):
             # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
             # there can be doubled or trippled currency sign, however none of the
             # locales use that.
-            p = p.replace(u'\xa4', "%2")
+            p = p.replace('\xa4', "%2")
             # Single quote goes away, but double goes to single:
             p = p.replace("''", '###').replace("'", '').replace('###', "'")
             # Use number system's signs: