1 files changed, 134 insertions, 173 deletions
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index 9cb1f63f8b..dae1894cd2 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -44,59 +44,6 @@ def startCount(c, text): # strspn
     except StopIteration:
         return len(text)
 
-def convertFormat(format):
-    """Convert date/time format-specier from CLDR to Qt
-
-    Match up (as best we can) the differences between:
-    * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
-    * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
-    """
-    # Compare and contrast dateconverter.py's convert_date().
-    # Need to (check consistency and) reduce redundancy !
-    result = ""
-    i = 0
-    while i < len(format):
-        if format[i] == "'":
-            result += "'"
-            i += 1
-            while i < len(format) and format[i] != "'":
-                result += format[i]
-                i += 1
-            if i < len(format):
-                result += "'"
-                i += 1
-        else:
-            s = format[i:]
-            if s.startswith('E'): # week-day
-                n = startCount('E', s)
-                if n < 3:
-                    result += 'ddd'
-                elif n == 4:
-                    result += 'dddd'
-                else: # 5: narrow, 6 short; but should be name, not number :-(
-                    result += 'd' if n < 6 else 'dd'
-                i += n
-            elif s[0] in 'ab': # am/pm
-                # 'b' should distinguish noon/midnight, too :-(
-                result += "AP"
-                i += startCount('ab', s)
-            elif s.startswith('S'): # fractions of seconds: count('S') == number of decimals to show
-                result += 'z'
-                i += startCount('S', s)
-            elif s.startswith('V'): # long time zone specifiers (and a deprecated short ID)
-                result += 't'
-                i += startCount('V', s)
-            elif s[0] in 'zv': # zone
-                # Should use full name, e.g. "Central European Time", if 'zzzz' :-(
-                # 'v' should get generic non-location format, e.g. PT for "Pacific Time", no DST indicator
-                result += "t"
-                i += startCount('zv', s)
-            else:
-                result += format[i]
-                i += 1
-
-    return result
-
 class QLocaleXmlReader (object):
     def __init__(self, filename):
         self.root = self.__parse(filename)
@@ -109,14 +56,14 @@ class QLocaleXmlReader (object):
         self.__likely = tuple(self.__likelySubtagsMap())
 
         # Mappings {ID: (enum name, code, en.xml name)}
-        self.languages = dict((v[0], v[1:]) for v in languages)
-        self.scripts = dict((v[0], v[1:]) for v in scripts)
-        self.territories = dict((v[0], v[1:]) for v in territories)
+        self.languages = {v[0]: v[1:] for v in languages}
+        self.scripts = {v[0]: v[1:] for v in scripts}
+        self.territories = {v[0]: v[1:] for v in territories}
 
         # Private mappings {enum name: (ID, code)}
-        self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
-        self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
-        self.__landByName = dict((v[1], (v[0], v[2])) for v in territories)
+        self.__langByName = {v[1]: (v[0], v[2]) for v in languages}
+        self.__textByName = {v[1]: (v[0], v[2]) for v in scripts}
+        self.__landByName = {v[1]: (v[0], v[2]) for v in territories}
         # Other properties:
         self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
         self.cldrVersion = self.__firstChildText(self.root, "version")
@@ -150,6 +97,21 @@ class QLocaleXmlReader (object):
 
             yield (language, script, territory), locale
 
+    def aliasToIana(self):
+        """Yield the 'alias' and 'iana' child texts of each zoneAliases/zoneAlias."""
+        for node in self.__eachEltInGroup(self.root, 'zoneAliases', 'zoneAlias'):
+            yield self.__firstChildText(node, 'alias'), self.__firstChildText(node, 'iana')
+
+    def msToIana(self):
+        """Yield the 'msid' and 'iana' child texts of each windowsZone/msZoneIana."""
+        for node in self.__eachEltInGroup(self.root, 'windowsZone', 'msZoneIana'):
+            yield self.__firstChildText(node, 'msid'), self.__firstChildText(node, 'iana')
+
+    def msLandIanas(self):
+        """Yield 'msid', 'territorycode', 'ianaids' texts per windowsZone/msLandZones."""
+        for elt in self.__eachEltInGroup(self.root, 'windowsZone', 'msLandZones'):
+            yield tuple(self.__firstChildText(elt, k) for k in ('msid', 'territorycode', 'ianaids'))
+
     def languageIndices(self, locales):
         index = 0
         for key, value in self.languages.items():
@@ -248,6 +210,8 @@ class QLocaleXmlReader (object):
         child = elt.firstChild
         while child:
             if child.nodeType == elt.TEXT_NODE:
+                # Note: do not strip(), as some group separators are
+                # non-breaking spaces, that strip() will discard.
                 yield child.nodeValue
             child = child.nextSibling
 
@@ -287,17 +251,16 @@ class Spacer (object):
         First argument, indent, is either None (its default, for
         'minifying'), an ingeter (number of spaces) or the unit of
         text that is to be used for each indentation level (e.g. '\t'
-        to use tabs).  If indent is None, no indentation is added, nor
+        to use tabs). If indent is None, no indentation is added, nor
         are line-breaks; otherwise, self(text), for non-empty text,
         shall end with a newline and begin with indentation.
 
         Second argument, initial, is the initial indentation; it is
-        ignored if indent is None.  Indentation increases after each
+        ignored if indent is None. Indentation increases after each
         call to self(text) in which text starts with a tag and doesn't
         include its end-tag; indentation decreases if text starts with
-        an end-tag.  The text is not parsed any more carefully than
-        just described.
-        """
+        an end-tag. The text is not parsed any more carefully than
+        just described."""
         if indent is None:
             self.__call = lambda x: x
         else:
@@ -323,6 +286,10 @@ class Spacer (object):
         return self.__call(line)
 
 class QLocaleXmlWriter (object):
+    """Save the full set of locale data to a QLocaleXML file.
+
+    The output saved by this should conform to qlocalexml.rnc's
+    schema."""
     def __init__(self, save = None, space = Spacer(4)):
         """Set up to write digested CLDR data as QLocale XML.
 
@@ -378,10 +345,48 @@ class QLocaleXmlWriter (object):
             self.__closeTag('likelySubtag')
         self.__closeTag('likelySubtags')
 
-    def locales(self, locales, calendars):
+    def zoneData(self, alias, defaults, windowsIds):
+        """Write the zoneAliases and windowsZone elements.
+
+        Arguments map: alias, names to IANA IDs (identity entries are
+        skipped); defaults, Windows IDs to IANA IDs; windowsIds, (Windows
+        ID, territory code) pairs to space-joined IANA IDs."""
+        enter, leave, leaf = self.__openTag, self.__closeTag, self.inTag
+        enter('zoneAliases')
+        for name, iana in sorted(alias.items()):
+            if name != iana: # An ID aliasing itself would be redundant.
+                enter('zoneAlias')
+                leaf('alias', name)
+                leaf('iana', iana)
+                leave('zoneAlias')
+        leave('zoneAliases')
+        enter('windowsZone')
+        for (winid, land), ianas in windowsIds.items():
+            enter('msLandZones')
+            leaf('msid', winid)
+            leaf('territorycode', land)
+            leaf('ianaids', ianas)
+            leave('msLandZones')
+        for winid, iana in defaults.items():
+            enter('msZoneIana')
+            leaf('msid', winid)
+            leaf('iana', iana)
+            leave('msZoneIana')
+        leave('windowsZone')
+
+    def locales(self, locales, calendars, en_US):
+        """Write the data for each locale.
+
+        First argument, locales, is the mapping whose values are the
+        Locale objects, with each key being the matching tuple of
+        numeric IDs for language, script, territory and variant.
+        Second argument is a tuple of calendar names. Third is the
+        tuple of numeric IDs that corresponds to en_US (needed to
+        provide fallbacks for the C locale)."""
+
         self.__openTag('localeList')
         self.__openTag('locale')
-        self.__writeLocale(Locale.C(calendars), calendars)
+        self.__writeLocale(Locale.C(locales[en_US]), calendars)
         self.__closeTag('locale')
         for key in sorted(locales.keys()):
             self.__openTag('locale')
@@ -396,7 +401,7 @@ class QLocaleXmlWriter (object):
         self.__write(f'<{tag}>{text}</{tag}>')
 
     def close(self, grumble):
-        """Finish writing and grumble any issues discovered."""
+        """Finish writing and grumble about any issues discovered."""
         if self.__rawOutput != self.__complain:
             self.__write('</localeDatabase>')
         self.__rawOutput = self.__complain
@@ -451,7 +456,10 @@ class QLocaleXmlWriter (object):
         self.__scripts.discard(locale.script_code)
         self.__territories.discard(locale.territory_code)
 
-    def __openTag(self, tag):
+    def __openTag(self, tag, **attrs):
+        """Write the start-tag for tag, with attrs as its name="value" attributes."""
+        # XML separates a start-tag's attributes by white-space, not by commas:
+        tag = ' '.join([tag, *(f'{k}="{v}"' for k, v in attrs.items())])
         self.__write(f'<{tag}>')
     def __closeTag(self, tag):
         self.__write(f'</{tag}>')
@@ -486,8 +494,6 @@ class Locale (object):
     __asint = ("currencyDigits", "currencyRounding")
     # Convert day-name to Qt day-of-week number:
     __asdow = ("firstDayOfWeek", "weekendStart", "weekendEnd")
-    # Convert from CLDR format-strings to QDateTimeParser ones:
-    __asfmt = ("longDateFormat", "shortDateFormat", "longTimeFormat", "shortTimeFormat")
     # Just use the raw text:
     __astxt = ("language", "languageEndonym", "script", "territory", "territoryEndonym",
                "decimal", "group", "zero",
@@ -496,9 +502,12 @@ class Locale (object):
                "alternateQuotationStart", "alternateQuotationEnd",
                "listPatternPartStart", "listPatternPartMiddle",
                "listPatternPartEnd", "listPatternPartTwo", "am", "pm",
+               "longDateFormat", "shortDateFormat",
+               "longTimeFormat", "shortTimeFormat",
                'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
                "currencyIsoCode", "currencySymbol", "currencyDisplayName",
-               "currencyFormat", "currencyNegativeFormat")
+               "currencyFormat", "currencyNegativeFormat",
+               )
 
     # Day-of-Week numbering used by Qt:
     __qDoW = {"mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5, "sat": 6, "sun": 7}
@@ -507,12 +516,15 @@ class Locale (object):
     def fromXmlData(cls, lookup, calendars=('gregorian',)):
         """Constructor from the contents of XML elements.
 
-        Single parameter, lookup, is called with the names of XML
-        elements that should contain the relevant data, within a CLDR
-        locale element (within a localeList element); these names are
-        used for the attributes of the object constructed.  Attribute
-        values are obtained by suitably digesting the returned element
-        texts.\n"""
+        First parameter, lookup, is called with the names of XML elements that
+        should contain the relevant data, within a QLocaleXML locale element
+        (within a localeList element); these names mostly match the attributes
+        of the object constructed. Its return must be the full text of the
+        first child DOM node element with the given name. Attribute values are
+        obtained by suitably digesting the returned element texts.
+
+        Optional second parameter, calendars, is a sequence of calendars for
+        which data is to be retrieved."""
         data = {}
         for k in cls.__asint:
             data[k] = int(lookup(k))
@@ -520,14 +532,11 @@ class Locale (object):
         for k in cls.__asdow:
             data[k] = cls.__qDoW[lookup(k)]
 
-        for k in cls.__asfmt:
-            data[k] = convertFormat(lookup(k))
-
         for k in cls.__astxt + tuple(cls.propsMonthDay('days')):
             data['listDelim' if k == 'list' else k] = lookup(k)
 
         for k in cls.propsMonthDay('months'):
-            data[k] = dict((cal, lookup('_'.join((k, cal)))) for cal in calendars)
+            data[k] = {cal: lookup('_'.join((k, cal))) for cal in calendars}
 
         grouping = lookup('groupSizes').split(';')
         data.update(groupLeast = int(grouping[0]),
@@ -566,7 +575,7 @@ class Locale (object):
                     'longDateFormat', 'shortDateFormat',
                     'longTimeFormat', 'shortTimeFormat',
                     'currencyIsoCode', 'currencySymbol', 'currencyDisplayName',
-                    'currencyFormat', 'currencyNegativeFormat'
+                    'currencyFormat', 'currencyNegativeFormat',
                     ) + tuple(self.propsMonthDay('days')) + tuple(
                 '_'.join((k, cal))
                 for k in self.propsMonthDay('months')
@@ -577,97 +586,49 @@ class Locale (object):
         for key in ('currencyDigits', 'currencyRounding'):
             write(key, get(key))
 
-    # Tools used by __monthNames:
-    def fullName(i, name): return name
-    def firstThree(i, name): return name[:3]
-    def initial(i, name): return name[:1]
-    def number(i, name): return str(i + 1)
-    def islamicShort(i, name):
-        if not name: return name
-        if name == 'Shawwal': return 'Shaw.'
-        words = name.split()
-        if words[0].startswith('Dhu'):
-            words[0] = words[0][:7] + '.'
-        elif len(words[0]) > 3:
-            words[0] = words[0][:3] + '.'
-        return ' '.join(words)
-    @staticmethod
-    def __monthNames(calendars,
-                     known={ # Map calendar to (names, extractors...):
-            # TODO: do we even need these ?  CLDR's root.xml seems to
-            # have them, complete with yeartype="leap" handling for
-            # Hebrew's extra.
-            'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July',
-                           'August', 'September', 'October', 'November', 'December'),
-                          # Extractor pairs, (plain, standalone)
-                          (fullName, fullName), # long
-                          (firstThree, firstThree), # short
-                          (number, initial)), # narrow
-            'persian': (('Farvardin', 'Ordibehesht', 'Khordad', 'Tir', 'Mordad',
-                         'Shahrivar', 'Mehr', 'Aban', 'Azar', 'Dey', 'Bahman', 'Esfand'),
-                        (fullName, fullName),
-                        (firstThree, firstThree),
-                        (number, initial)),
-            'islamic': (('Muharram', 'Safar', 'Rabiʻ I', 'Rabiʻ II', 'Jumada I',
-                         'Jumada II', 'Rajab', 'Shaʻban', 'Ramadan', 'Shawwal',
-                         'Dhuʻl-Qiʻdah', 'Dhuʻl-Hijjah'),
-                        (fullName, fullName),
-                        (islamicShort, islamicShort),
-                        (number, number)),
-            'hebrew': (('Tishri', 'Heshvan', 'Kislev', 'Tevet', 'Shevat', 'Adar I',
-                        'Adar', 'Nisan', 'Iyar', 'Sivan', 'Tamuz', 'Av'),
-                       (fullName, fullName),
-                       (fullName, fullName),
-                       (number, number)),
-            },
-                     sizes=('long', 'short', 'narrow')):
-        for cal in calendars:
-            try:
-                data = known[cal]
-            except KeyError as e: # Need to add an entry to known, above.
-                e.args += ('Unsupported calendar:', cal)
-                raise
-            names, get = data[0], data[1:]
-            for n, size in enumerate(sizes):
-                yield ('_'.join((camelCase((size, 'months')), cal)),
-                       ';'.join(get[n][0](i, x) for i, x in enumerate(names)))
-                yield ('_'.join((camelCase(('standalone', size, 'months')), cal)),
-                       ';'.join(get[n][1](i, x) for i, x in enumerate(names)))
-    del fullName, firstThree, initial, number, islamicShort
-
     @classmethod
-    def C(cls, calendars=('gregorian',),
-          days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
-                  'Thursday', 'Friday', 'Saturday'),
-          quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
-        """Returns an object representing the C locale."""
-        return cls(cls.__monthNames(calendars),
-                   language='C', language_code='0', languageEndonym='',
-                   script='AnyScript', script_code='0',
-                   territory='AnyTerritory', territory_code='0', territoryEndonym='',
-                   groupSizes=(3, 3, 1),
-                   decimal='.', group=',', list=';', percent='%',
-                   zero='0', minus='-', plus='+', exp='e',
+    def C(cls, en_US):
+        """Returns an object representing the C locale.
+
+        Required argument, en_US, is the corresponding object for the
+        en_US locale (or the en_US_POSIX one if we ever support
+        variants). The C locale inherits from this, overriding what it
+        may need to."""
+        base = en_US.__dict__.copy()
+        # Soroush's original contribution shortened Jalali month names
+        # - contrary to CLDR, which doesn't abbreviate these in
+        # root.xml or en.xml, although some locales do, e.g. fr_CA.
+        # For compatibility with that, keep only each name's first three characters:
+        for k in ('shortMonths_persian', 'standaloneShortMonths_persian'):
+            base[k] = ';'.join(x[:3] for x in base[k].split(';'))
+
+        return cls(base,
+                   language='C', language_code='',
+                   language_id=0, languageEndonym='',
+                   script='AnyScript', script_code='', script_id=0,
+                   territory='AnyTerritory', territory_code='',
+                   territory_id=0, territoryEndonym='',
+                   variant='', variant_code='', variant_id=0,
+                   # CLDR has non-ASCII versions of these:
                    quotationStart='"', quotationEnd='"',
-                   alternateQuotationStart='\'', alternateQuotationEnd='\'',
-                   listPatternPartStart='%1, %2',
-                   listPatternPartMiddle='%1, %2',
-                   listPatternPartEnd='%1, %2',
-                   listPatternPartTwo='%1, %2',
-                   byte_unit='bytes',
-                   byte_si_quantified=';'.join(q + 'B' for q in quantifiers),
-                   byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers),
-                   am='AM', pm='PM', firstDayOfWeek='mon',
-                   weekendStart='sat', weekendEnd='sun',
-                   longDateFormat='EEEE, d MMMM yyyy', shortDateFormat='d MMM yyyy',
-                   longTimeFormat='HH:mm:ss z', shortTimeFormat='HH:mm:ss',
-                   longDays=';'.join(days),
-                   shortDays=';'.join(d[:3] for d in days),
-                   narrowDays='7;1;2;3;4;5;6',
-                   standaloneLongDays=';'.join(days),
-                   standaloneShortDays=';'.join(d[:3] for d in days),
-                   standaloneNarrowDays=';'.join(d[:1] for d in days),
-                   currencyIsoCode='', currencySymbol='',
-                   currencyDisplayName='',
+                   alternateQuotationStart="'", alternateQuotationEnd="'",
+                   # CLDR gives 'dddd, MMMM d, yyyy', 'M/d/yy', 'h:mm:ss Ap tttt',
+                   # 'h:mm Ap' with non-breaking space before Ap.
+                   longDateFormat='dddd, d MMMM yyyy', shortDateFormat='d MMM yyyy',
+                   longTimeFormat='HH:mm:ss t', shortTimeFormat='HH:mm:ss',
+                   # CLDR has US-$ and US-style formats:
+                   currencyIsoCode='', currencySymbol='', currencyDisplayName='',
                    currencyDigits=2, currencyRounding=1,
-                   currencyFormat='%1%2', currencyNegativeFormat='')
+                   currencyFormat='%1%2', currencyNegativeFormat='',
+                   # We may want to fall back to CLDR for some of these:
+                   firstDayOfWeek='mon', # CLDR has 'sun'
+                   exp='e', # CLDR has 'E'
+                   listPatternPartEnd='%1, %2', # CLDR has '%1, and %2'
+                   listPatternPartTwo='%1, %2', # CLDR has '%1 and %2'
+                   narrowDays='7;1;2;3;4;5;6', # CLDR has letters
+                   narrowMonths_gregorian='1;2;3;4;5;6;7;8;9;10;11;12', # CLDR has letters
+                   standaloneNarrowMonths_persian='F;O;K;T;M;S;M;A;A;D;B;E', # CLDR has digits
+                   # Keep these explicit, despite matching CLDR:
+                   decimal='.', group=',', percent='%',
+                   zero='0', minus='-', plus='+',
+                   am='AM', pm='PM', weekendStart='sat', weekendEnd='sun')