1 files changed, 152 insertions, 154 deletions
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index e5aadba995..5cb56c2165 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -1,31 +1,5 @@
-# coding=utf8
-#############################################################################
-##
-## Copyright (C) 2020 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
+# Copyright (C) 2021 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
 """Shared serialization-scanning code for QLocaleXML format.
 
 Provides classes:
@@ -35,15 +9,23 @@ Provides classes:
 
 Support:
   Spacer -- provides control over indentation of the output.
+
+The RelaxNG schema for this file format can be found in qlocalexml.rnc.
+QLocaleXML files can be validated using:
+
+    jing -c qlocalexml.rnc <file.xml>
+
+You can download jing from https://relaxng.org/jclark/jing.html if your
+package manager lacks the jing package.
 """
-from __future__ import print_function
+
 from xml.sax.saxutils import escape
 
 from localetools import Error
 
 # Tools used by Locale:
 def camel(seq):
-    yield seq.next()
+    yield next(seq)
     for word in seq:
         yield word.capitalize()
 
@@ -51,88 +33,39 @@ def camelCase(words):
     return ''.join(camel(iter(words)))
 
 def addEscapes(s):
-    return ''.join(c if n < 128 else '\\x{:02x}'.format(n)
+    return ''.join(c if n < 128 else f'\\x{n:02x}'
                    for n, c in ((ord(c), c) for c in s))
 
 def startCount(c, text): # strspn
     """First index in text where it doesn't have a character in c"""
     assert text and text[0] in c
     try:
-        return (j for j, d in enumerate(text) if d not in c).next()
+        return next((j for j, d in enumerate(text) if d not in c))
     except StopIteration:
         return len(text)
 
-def convertFormat(format):
-    """Convert date/time format-specier from CLDR to Qt
-
-    Match up (as best we can) the differences between:
-    * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
-    * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
-    """
-    # Compare and contrast dateconverter.py's convert_date().
-    # Need to (check consistency and) reduce redundancy !
-    result = ""
-    i = 0
-    while i < len(format):
-        if format[i] == "'":
-            result += "'"
-            i += 1
-            while i < len(format) and format[i] != "'":
-                result += format[i]
-                i += 1
-            if i < len(format):
-                result += "'"
-                i += 1
-        else:
-            s = format[i:]
-            if s.startswith('E'): # week-day
-                n = startCount('E', s)
-                if n < 3:
-                    result += 'ddd'
-                elif n == 4:
-                    result += 'dddd'
-                else: # 5: narrow, 6 short; but should be name, not number :-(
-                    result += 'd' if n < 6 else 'dd'
-                i += n
-            elif s[0] in 'ab': # am/pm
-                # 'b' should distinguish noon/midnight, too :-(
-                result += "AP"
-                i += startCount('ab', s)
-            elif s.startswith('S'): # fractions of seconds: count('S') == number of decimals to show
-                result += 'z'
-                i += startCount('S', s)
-            elif s.startswith('V'): # long time zone specifiers (and a deprecated short ID)
-                result += 't'
-                i += startCount('V', s)
-            elif s[0] in 'zv': # zone
-                # Should use full name, e.g. "Central European Time", if 'zzzz' :-(
-                # 'v' should get generic non-location format, e.g. PT for "Pacific Time", no DST indicator
-                result += "t"
-                i += startCount('zv', s)
-            else:
-                result += format[i]
-                i += 1
-
-    return result
-
 class QLocaleXmlReader (object):
     def __init__(self, filename):
         self.root = self.__parse(filename)
-        # Lists of (id, name, code) triples:
-        languages = tuple(self.__loadMap('language'))
-        scripts = tuple(self.__loadMap('script'))
-        countries = tuple(self.__loadMap('country'))
+
+        from enumdata import language_map, script_map, territory_map
+        # Lists of (id, enum name, code, en.xml name) tuples:
+        languages = tuple(self.__loadMap('language', language_map))
+        scripts = tuple(self.__loadMap('script', script_map))
+        territories = tuple(self.__loadMap('territory', territory_map))
         self.__likely = tuple(self.__likelySubtagsMap())
-        # Mappings {ID: (name, code)}
-        self.languages = dict((v[0], v[1:]) for v in languages)
-        self.scripts = dict((v[0], v[1:]) for v in scripts)
-        self.countries = dict((v[0], v[1:]) for v in countries)
-        # Private mappings {name: (ID, code)}
-        self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
-        self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
-        self.__landByName = dict((v[1], (v[0], v[2])) for v in countries)
+
+        # Mappings {ID: (enum name, code, en.xml name)}
+        self.languages = {v[0]: v[1:] for v in languages}
+        self.scripts = {v[0]: v[1:] for v in scripts}
+        self.territories = {v[0]: v[1:] for v in territories}
+
+        # Private mappings {enum name: (ID, code)}
+        self.__langByName = {v[1]: (v[0], v[2]) for v in languages}
+        self.__textByName = {v[1]: (v[0], v[2]) for v in scripts}
+        self.__landByName = {v[1]: (v[0], v[2]) for v in territories}
         # Other properties:
-        self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries)
+        self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
         self.cldrVersion = self.__firstChildText(self.root, "version")
 
     def loadLocaleMap(self, calendars, grumble = lambda text: None):
@@ -142,31 +75,31 @@ class QLocaleXmlReader (object):
             locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars)
             language = self.__langByName[locale.language][0]
             script = self.__textByName[locale.script][0]
-            country = self.__landByName[locale.country][0]
+            territory = self.__landByName[locale.territory][0]
 
             if language != 1: # C
-                if country == 0:
-                    grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language))
+                if territory == 0:
+                    grumble(f'loadLocaleMap: No territory id for "{locale.language}"\n')
 
                 if script == 0:
-                    # Find default script for the given language and country - see:
+                    # Find default script for the given language and territory - see:
                     # http://www.unicode.org/reports/tr35/#Likely_Subtags
                     try:
                         try:
-                            to = likely[(locale.language, 'AnyScript', locale.country)]
+                            to = likely[(locale.language, 'AnyScript', locale.territory)]
                         except KeyError:
-                            to = likely[(locale.language, 'AnyScript', 'AnyCountry')]
+                            to = likely[(locale.language, 'AnyScript', 'AnyTerritory')]
                     except KeyError:
                         pass
                     else:
                         locale.script = to[1]
                         script = self.__textByName[locale.script][0]
 
-            yield (language, script, country), locale
+            yield (language, script, territory), locale
 
     def languageIndices(self, locales):
         index = 0
-        for key, value in self.languages.iteritems():
+        for key, value in self.languages.items():
             i, count = 0, locales.count(key)
             if count > 0:
                 i = index
@@ -190,26 +123,53 @@ class QLocaleXmlReader (object):
                    '_'.join(tag(give)), ids(give))
 
     def defaultMap(self):
-        """Map language and script to their default country by ID.
+        """Map language and script to their default territory by ID.
 
-        Yields ((language, script), country) wherever the likely
+        Yields ((language, script), territory) wherever the likely
         sub-tags mapping says language's default locale uses the given
-        script and country."""
+        script and territory."""
         for have, give in self.__likely:
-            if have[1:] == ('AnyScript', 'AnyCountry') and give[2] != 'AnyCountry':
+            if have[1:] == ('AnyScript', 'AnyTerritory') and give[2] != 'AnyTerritory':
                 assert have[0] == give[0], (have, give)
                 yield ((self.__langByName[give[0]][0],
                         self.__textByName[give[1]][0]),
                        self.__landByName[give[2]][0])
 
+    def enumify(self, name, suffix):
+        """Build an enum-member identifier from an enumdata.py name.
+
+        Names given in enumdata.py include spaces and hyphens, which
+        can't appear in an identifier. Merely removing them would lose
+        the word boundaries, so each word's first letter is upper-cased;
+        the rest of each word is left alone (no capitalize()), as some
+        words, e.g. McDonald, have later capitals in them.
+
+        A suffix of 'Script' is added to any name not already ending in
+        it. Any other suffix is added only to names shared between
+        languages and territories, to tell them apart. Raises Error if a
+        shared name already ends in suffix or a script name matches one."""
+        name = ''.join(w[0].upper() + w[1:] for w in name.replace('-', ' ').split())
+        clash = name in self.__dupes
+        if suffix != 'Script':
+            if clash and name.endswith(suffix): # An assert would vanish under -O.
+                raise Error(f'The name "{name}" already ends in "{suffix}"')
+            return name + suffix if clash else name
+
+        if not name.endswith(suffix):
+            name += suffix
+        if name in self.__dupes:
+            raise Error(f'The script name "{name}" is messy')
+        return name
+
     # Implementation details:
-    def __loadMap(self, category):
+    def __loadMap(self, category, enum):
         kid = self.__firstChildText
-        for element in self.__eachEltInGroup(self.root, category + 'List', category):
-            yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code')
+        for element in self.__eachEltInGroup(self.root, f'{category}List', category):
+            key = int(kid(element, 'id'))
+            yield key, enum[key][0], kid(element, 'code'), kid(element, 'name')
 
     def __likelySubtagsMap(self):
-        def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText):
+        def triplet(element, keys=('language', 'script', 'territory'), kid = self.__firstChildText):
             return tuple(kid(element, key) for key in keys)
 
         kid = self.__firstChildElt
@@ -246,7 +206,7 @@ class QLocaleXmlReader (object):
                 return child
             child = child.nextSibling
 
-        raise Error('No {} child found'.format(name))
+        raise Error(f'No {name} child found')
 
     @classmethod
     def __firstChildText(cls, elt, key):
@@ -302,7 +262,7 @@ class Spacer (object):
         elif line.startswith('<') and not line.startswith('<!'):
             cut = line.find('>')
             tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0]
-            if '</{}>'.format(tag) not in line:
+            if f'</{tag}>' not in line:
                 self.current += self.__each
         return indent + line + '\n'
 
@@ -333,10 +293,28 @@ class QLocaleXmlWriter (object):
         self.__write('<localeDatabase>')
 
     # Output of various sections, in their usual order:
-    def enumData(self, languages, scripts, countries):
-        self.__enumTable('language', languages)
-        self.__enumTable('script', scripts)
-        self.__enumTable('country', countries)
+    def enumData(self, code2name):
+        """Output name/id/code tables for language, script and territory.
+
+        The code2name parameter is a function that takes 'language',
+        'script' or 'territory' and returns a lookup function that
+        maps codes, of the relevant type, to their English names. This
+        lookup function is passed a code and the name, both taken from
+        enumdata.py, that QLocale uses, so the .get() of a dict will
+        work. The English name from this lookup will be used by
+        QLocale::*ToString() for the enum member whose name is based
+        on the enumdata.py name passed as fallback to the lookup."""
+        from enumdata import language_map, script_map, territory_map
+        self.__enumTable('language', language_map, code2name)
+        self.__enumTable('script', script_map, code2name)
+        self.__enumTable('territory', territory_map, code2name)
+        # Collect every code, bar the placeholders; __writeLocale() discards
+        self.__languages = {p[1] for p in language_map.values()
+                            if not p[1].isspace()} # those used, close() reports the rest.
+        self.__scripts = {p[1] for p in script_map.values()
+                          if p[1] != 'Zzzz'}
+        self.__territories = {p[1] for p in territory_map.values()
+                              if p[1] != 'ZZ'}
 
     def likelySubTags(self, entries):
         self.__openTag('likelySubtags')
@@ -350,13 +328,11 @@ class QLocaleXmlWriter (object):
     def locales(self, locales, calendars):
         self.__openTag('localeList')
         self.__openTag('locale')
-        Locale.C(calendars).toXml(self.inTag, calendars)
+        self.__writeLocale(Locale.C(calendars), calendars)
         self.__closeTag('locale')
-        keys = locales.keys()
-        keys.sort()
-        for key in keys:
+        for key in sorted(locales.keys()):
             self.__openTag('locale')
-            locales[key].toXml(self.inTag, calendars)
+            self.__writeLocale(locales[key], calendars)
             self.__closeTag('locale')
         self.__closeTag('localeList')
 
@@ -364,13 +340,27 @@ class QLocaleXmlWriter (object):
         self.inTag('version', cldrVersion)
 
     def inTag(self, tag, text):
-        self.__write('<{0}>{1}</{0}>'.format(tag, text))
+        self.__write(f'<{tag}>{text}</{tag}>')
 
-    def close(self):
+    def close(self, grumble):
+        """Finish writing and grumble about any issues discovered."""
         if self.__rawOutput != self.__complain:
             self.__write('</localeDatabase>')
         self.__rawOutput = self.__complain
 
+        if self.__languages or self.__scripts or self.__territories:
+            grumble('Some enum members are unused, corresponding to these tags:\n')
+            import textwrap
+            def kvetch(kind, seq, g = grumble, w = textwrap.wrap):
+                g('\n\t'.join(w(f' {kind}: {", ".join(sorted(seq))}', width=80)) + '\n')
+            if self.__languages:
+                kvetch('Languages', self.__languages)
+            if self.__scripts:
+                kvetch('Scripts', self.__scripts)
+            if self.__territories:
+                kvetch('Territories', self.__territories)
+            grumble('It may make sense to deprecate them.\n')
+
     # Implementation details
     @staticmethod
     def __printit(text):
@@ -379,28 +369,39 @@ class QLocaleXmlWriter (object):
     def __complain(text):
         raise Error('Attempted to write data after closing :-(')
 
-    def __enumTable(self, tag, table):
-        self.__openTag(tag + 'List')
-        for key, value in table.iteritems():
+    @staticmethod
+    def __xmlSafe(text): # Escape the XML-special &, < and > in text.
+        return escape(text) # xml.sax.saxutils.escape() replaces exactly these three.
+
+    def __enumTable(self, tag, table, code2name):
+        self.__openTag(f'{tag}List')
+        enname, safe = code2name(tag), self.__xmlSafe
+        for key, (name, code) in table.items():
             self.__openTag(tag)
-            self.inTag('name', value[0])
+            self.inTag('name', safe(enname(code, name)))
             self.inTag('id', key)
-            self.inTag('code', value[1])
+            self.inTag('code', code)
             self.__closeTag(tag)
-        self.__closeTag(tag + 'List')
+        self.__closeTag(f'{tag}List')
 
     def __likelySubTag(self, tag, likely):
         self.__openTag(tag)
         self.inTag('language', likely[0])
         self.inTag('script', likely[1])
-        self.inTag('country', likely[2])
+        self.inTag('territory', likely[2])
         # self.inTag('variant', likely[3])
         self.__closeTag(tag)
 
+    def __writeLocale(self, locale, calendars): # Emit one locale and tick off its codes.
+        locale.toXml(self.inTag, calendars)
+        self.__languages.discard(locale.language_code) # These sets come from enumData();
+        self.__scripts.discard(locale.script_code) # whatever is left in them when
+        self.__territories.discard(locale.territory_code) # close() runs is reported as unused.
+
     def __openTag(self, tag):
-        self.__write('<{}>'.format(tag))
+        self.__write(f'<{tag}>')
     def __closeTag(self, tag):
-        self.__write('</{}>'.format(tag))
+        self.__write(f'</{tag}>')
 
     def __write(self, line):
         self.__rawOutput(self.__wrap(line))
@@ -432,16 +433,16 @@ class Locale (object):
     __asint = ("currencyDigits", "currencyRounding")
     # Convert day-name to Qt day-of-week number:
     __asdow = ("firstDayOfWeek", "weekendStart", "weekendEnd")
-    # Convert from CLDR format-strings to QDateTimeParser ones:
-    __asfmt = ("longDateFormat", "shortDateFormat", "longTimeFormat", "shortTimeFormat")
     # Just use the raw text:
-    __astxt = ("language", "languageEndonym", "script", "country", "countryEndonym",
+    __astxt = ("language", "languageEndonym", "script", "territory", "territoryEndonym",
                "decimal", "group", "zero",
                "list", "percent", "minus", "plus", "exp",
                "quotationStart", "quotationEnd",
                "alternateQuotationStart", "alternateQuotationEnd",
                "listPatternPartStart", "listPatternPartMiddle",
                "listPatternPartEnd", "listPatternPartTwo", "am", "pm",
+               "longDateFormat", "shortDateFormat",
+               "longTimeFormat", "shortTimeFormat",
                'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
                "currencyIsoCode", "currencySymbol", "currencyDisplayName",
                "currencyFormat", "currencyNegativeFormat")
@@ -466,14 +467,11 @@ class Locale (object):
         for k in cls.__asdow:
             data[k] = cls.__qDoW[lookup(k)]
 
-        for k in cls.__asfmt:
-            data[k] = convertFormat(lookup(k))
-
         for k in cls.__astxt + tuple(cls.propsMonthDay('days')):
             data['listDelim' if k == 'list' else k] = lookup(k)
 
         for k in cls.propsMonthDay('months'):
-            data[k] = dict((cal, lookup('_'.join((k, cal)))) for cal in calendars)
+            data[k] = {cal: lookup('_'.join((k, cal))) for cal in calendars}
 
         grouping = lookup('groupSizes').split(';')
         data.update(groupLeast = int(grouping[0]),
@@ -493,15 +491,15 @@ class Locale (object):
         form used by CLDR; its default is ('gregorian',).
         """
         get = lambda k: getattr(self, k)
-        for key in ('language', 'script', 'country'):
+        for key in ('language', 'script', 'territory'):
             write(key, get(key))
-            write('{}code'.format(key), get('{}_code'.format(key)))
+            write(f'{key}code', get(f'{key}_code'))
 
         for key in ('decimal', 'group', 'zero', 'list',
                     'percent', 'minus', 'plus', 'exp'):
             write(key, get(key))
 
-        for key in ('languageEndonym', 'countryEndonym',
+        for key in ('languageEndonym', 'territoryEndonym',
                     'quotationStart', 'quotationEnd',
                     'alternateQuotationStart', 'alternateQuotationEnd',
                     'listPatternPartStart', 'listPatternPartMiddle',
@@ -517,7 +515,7 @@ class Locale (object):
                 '_'.join((k, cal))
                 for k in self.propsMonthDay('months')
                 for cal in calendars):
-            write(key, escape(get(key)).encode('utf-8'))
+            write(key, escape(get(key)))
 
         write('groupSizes', ';'.join(str(x) for x in get('groupSizes')))
         for key in ('currencyDigits', 'currencyRounding'):
@@ -554,9 +552,9 @@ class Locale (object):
                         (fullName, fullName),
                         (firstThree, firstThree),
                         (number, initial)),
-            'islamic': ((u'Muharram', u'Safar', u'Rabiʻ I', u'Rabiʻ II', u'Jumada I',
-                         u'Jumada II', u'Rajab', u'Shaʻban', u'Ramadan', u'Shawwal',
-                         u'Dhuʻl-Qiʻdah', u'Dhuʻl-Hijjah'),
+            'islamic': (('Muharram', 'Safar', 'Rabiʻ I', 'Rabiʻ II', 'Jumada I',
+                         'Jumada II', 'Rajab', 'Shaʻban', 'Ramadan', 'Shawwal',
+                         'Dhuʻl-Qiʻdah', 'Dhuʻl-Hijjah'),
                         (fullName, fullName),
                         (islamicShort, islamicShort),
                         (number, number)),
@@ -565,7 +563,7 @@ class Locale (object):
                        (fullName, fullName),
                        (fullName, fullName),
                        (number, number)),
-            },
+                     },
                      sizes=('long', 'short', 'narrow')):
         for cal in calendars:
             try:
@@ -590,7 +588,7 @@ class Locale (object):
         return cls(cls.__monthNames(calendars),
                    language='C', language_code='0', languageEndonym='',
                    script='AnyScript', script_code='0',
-                   country='AnyCountry', country_code='0', countryEndonym='',
+                   territory='AnyTerritory', territory_code='0', territoryEndonym='',
                    groupSizes=(3, 3, 1),
                    decimal='.', group=',', list=';', percent='%',
                    zero='0', minus='-', plus='+', exp='e',
@@ -605,8 +603,8 @@ class Locale (object):
                    byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers),
                    am='AM', pm='PM', firstDayOfWeek='mon',
                    weekendStart='sat', weekendEnd='sun',
-                   longDateFormat='EEEE, d MMMM yyyy', shortDateFormat='d MMM yyyy',
-                   longTimeFormat='HH:mm:ss z', shortTimeFormat='HH:mm:ss',
+                   longDateFormat='dddd, d MMMM yyyy', shortDateFormat='d MMM yyyy',
+                   longTimeFormat='HH:mm:ss t', shortTimeFormat='HH:mm:ss',
                    longDays=';'.join(days),
                    shortDays=';'.join(d[:3] for d in days),
                    narrowDays='7;1;2;3;4;5;6',