2 files changed, 330 insertions, 197 deletions
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index ee53381b22..fba8d7fdd5 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -61,13 +61,13 @@ import enumdata
 import xpathlite
 from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
 from dateconverter import convert_date
-from qlocalexml import Locale
+from qlocalexml import Locale, QLocaleXmlWriter
 
 # TODO: make calendars a command-line option
 calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
 findEntryInFile = xpathlite._findEntryInFile
-def wrappedwarn(prefix, tokens):
-    return sys.stderr.write(
+def wrappedwarn(err, prefix, tokens): # Writes prefix + comma-joined tokens, wrapped to 80 columns, to err.
+    return err.write(
         '\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
                                 subsequent_indent=' ', width=80)) + '\n')
 
@@ -101,6 +101,7 @@ def parse_number_format(patterns, data):
         result.append(pattern)
     return result
 
+cldr_dir = None # Set by main(); getNumberSystems() reads it, so must not be called before then.
 def raiseUnknownCode(code, form, cache={}):
     """Check whether an unknown code could be supported.
 
@@ -193,8 +194,8 @@ def getNumberSystems(cache={}):
     """Cached look-up of number system information.
 
     Pass no arguments.  Returns a mapping from number system names to,
-    for each system, a mapping with keys u'digits', u'type' and
-    u'id'\n"""
+    for each system, a mapping with keys 'digits', 'type' and 'id'.
+    Relies on global cldr_dir being set before it's first called.\n"""
     if not cache:
         for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
                                               'numberingSystems.xml'),
@@ -419,26 +420,7 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
 
     return Locale(result)
 
-def addEscapes(s):
-    result = ''
-    for c in s:
-        n = ord(c)
-        if n < 128:
-            result += c
-        else:
-            result += "\\x"
-            result += "%02x" % (n)
-    return result
-
-def unicodeStr(s):
-    utf8 = s.encode('utf-8')
-    return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"
-
-def usage():
-    print "Usage: cldr2qlocalexml.py <path-to-cldr-main>"
-    sys.exit()
-
-def integrateWeekData(filePath):
+def integrateWeekData(filePath, locale_database):
     if not filePath.endswith(".xml"):
         return {}
 
@@ -510,111 +492,6 @@ def splitLocale(name):
     tag = (tag if tag else tags.next(),)
     sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
 
-if len(sys.argv) != 2:
-    usage()
-
-cldr_dir = sys.argv[1]
-
-if not os.path.isdir(cldr_dir):
-    usage()
-
-cldr_files = os.listdir(cldr_dir)
-
-locale_database = {}
-
-# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = []
-for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
-                                      'supplementalMetadata.xml'),
-                         'metadata/defaultContent'):
-    for data in ns[1:][0]:
-        if data[0] == u"locales":
-            defaultContent_locales += data[1].split()
-
-skips = []
-for file in defaultContent_locales:
-    try:
-        language_code, script_code, country_code = splitLocale(file)
-    except ValueError:
-        sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
-        continue
-
-    if not (script_code or country_code):
-        sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
-        continue
-
-    try:
-        l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
-        if not l:
-            skips.append(file)
-            continue
-    except xpathlite.Error as e:
-        sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
-        continue
-
-    locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
-    wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
-    skips = []
-
-for file in cldr_files:
-    try:
-        l = generateLocaleInfo(cldr_dir + "/" + file)
-        if not l:
-            skips.append(file)
-            continue
-    except xpathlite.Error as e:
-        sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
-        continue
-
-    locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
-    wrappedwarn('skipping files [no locale info generated]: ', skips)
-
-integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
-locale_keys = locale_database.keys()
-locale_keys.sort()
-
-cldr_version = 'unknown'
-ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r")
-for line in ldml:
-    if 'version cldrVersion CDATA #FIXED' in line:
-        cldr_version = line.split('"')[1]
-
-print "<localeDatabase>"
-print "    <version>" + cldr_version + "</version>"
-print "    <languageList>"
-for id in enumdata.language_list:
-    l = enumdata.language_list[id]
-    print "        <language>"
-    print "            <name>" + l[0] + "</name>"
-    print "            <id>" + str(id) + "</id>"
-    print "            <code>" + l[1] + "</code>"
-    print "        </language>"
-print "    </languageList>"
-
-print "    <scriptList>"
-for id in enumdata.script_list:
-    l = enumdata.script_list[id]
-    print "        <script>"
-    print "            <name>" + l[0] + "</name>"
-    print "            <id>" + str(id) + "</id>"
-    print "            <code>" + l[1] + "</code>"
-    print "        </script>"
-print "    </scriptList>"
-
-print "    <countryList>"
-for id in enumdata.country_list:
-    l = enumdata.country_list[id]
-    print "        <country>"
-    print "            <name>" + l[0] + "</name>"
-    print "            <id>" + str(id) + "</id>"
-    print "            <code>" + l[1] + "</code>"
-    print "        </country>"
-print "    </countryList>"
-
 def _parseLocale(l):
     language = "AnyLanguage"
     script = "AnyScript"
@@ -651,48 +528,135 @@ def _parseLocale(l):
 
     return (language, script, country)
 
-skips = []
-print "    <likelySubtags>"
-for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
-    tmp = {}
-    for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
-        tmp[data[0]] = data[1]
+def likelySubtags(root, err):
+    """Yield pairs of (language, script, country) name-triples, from and to,
+    read from supplemental/likelySubtags.xml under root; skips go to err."""
+    skips = []
+    for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"):
+        # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
+        tmp = {data[0]: data[1] for data in ns[1]}
+        try:
+            from_language, from_script, from_country = _parseLocale(tmp[u"from"])
+            to_language, to_script, to_country = _parseLocale(tmp[u"to"])
+        except xpathlite.Error as e:
+            if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
+                skips.append(tmp[u'to'])
+            else:
+                err.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
+            continue
+        # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+        if to_country == "AnyCountry" and from_country != to_country:
+            to_country = from_country
+        if to_script == "AnyScript" and from_script != to_script:
+            to_script = from_script
+
+        yield ((from_language, from_script, from_country),
+               (to_language, to_script, to_country))
+    if skips:
+        wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips)
+
+def usage(err, name, message = ''):
+    """Write the usage summary, then message if it is non-empty, to err."""
+    err.write("Usage: {} <path-to-cldr-main> [out-file.xml]\n".format(name)) # TODO: expand
+    if message:
+        err.write('\n' + message + '\n')
+
+def main(args, out, err):
+    """Digest CLDR data as QLocaleXML; returns 0 on success, 1 on bad usage.
+
+    Consumes args: program name, path to CLDR's main/, optional output file
+    (else XML is written to out). Usage errors and main()'s skips go to err."""
+    name = args.pop(0)
+    if len(args) < 1:
+        usage(err, name)
+        return 1
+
+    global cldr_dir
+    cldr_dir = args.pop(0)
+    if not os.path.isdir(cldr_dir):
+        usage(err, name, 'Where did you unpack the CLDR data files ?')
+        return 1
+
+    if len(args) > 1:
+        usage(err, name, 'Too many arguments passed')
+        return 1
+    qxml = open(args.pop(0), 'w') if args else out
 
-    try:
-        from_language, from_script, from_country = _parseLocale(tmp[u"from"])
-        to_language, to_script, to_country = _parseLocale(tmp[u"to"])
-    except xpathlite.Error as e:
-        if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
-            skips.append(tmp[u'to'])
-        else:
-            sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
-        continue
-    # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
-    if to_country == "AnyCountry" and from_country != to_country:
-        to_country = from_country
-    if to_script == "AnyScript" and from_script != to_script:
-        to_script = from_script
-
-    print "        <likelySubtag>"
-    print "            <from>"
-    print "                <language>" + from_language + "</language>"
-    print "                <script>" + from_script + "</script>"
-    print "                <country>" + from_country + "</country>"
-    print "            </from>"
-    print "            <to>"
-    print "                <language>" + to_language + "</language>"
-    print "                <script>" + to_script + "</script>"
-    print "                <country>" + to_country + "</country>"
-    print "            </to>"
-    print "        </likelySubtag>"
-print "    </likelySubtags>"
-if skips:
-    wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
-print "    <localeList>"
-
-Locale.C(calendars).toXml(calendars)
-for key in locale_keys:
-    locale_database[key].toXml(calendars)
-
-print "    </localeList>"
-print "</localeDatabase>"
+    getNumberSystems() # Takes no arguments: its one parameter is its cache.
+    cldr_files = os.listdir(cldr_dir)
+    locale_database = {}
+
+    # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
+    defaultContent_locales = []
+    for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
+                                          'supplementalMetadata.xml'),
+                             'metadata/defaultContent'):
+        for data in ns[1:][0]:
+            if data[0] == u"locales":
+                defaultContent_locales += data[1].split()
+
+    skips = []
+    for file in defaultContent_locales:
+        try:
+            language_code, script_code, country_code = splitLocale(file)
+        except ValueError:
+            err.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
+            continue
+
+        if not (script_code or country_code):
+            err.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
+            continue
+
+        try:
+            l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
+            if not l:
+                skips.append(file)
+                continue
+        except xpathlite.Error as e:
+            err.write('skipping defaultContent locale "{}" ({})\n'.format(file, str(e)))
+            continue
+
+        locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
+
+    if skips:
+        wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ', skips)
+        skips = []
+
+    for file in cldr_files:
+        try:
+            l = generateLocaleInfo(cldr_dir + "/" + file)
+            if not l:
+                skips.append(file)
+                continue
+        except xpathlite.Error as e:
+            err.write('skipping file "{}" ({})\n'.format(file, str(e)))
+            continue
+
+        locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
+
+    if skips:
+        wrappedwarn(err, 'skipping files [no locale info generated]: ', skips)
+
+    integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml", locale_database)
+    cldr_version = 'unknown'
+    with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml:
+        for line in ldml:
+            if 'version cldrVersion CDATA #FIXED' in line:
+                cldr_version = line.split('"')[1]
+
+    xmlOut = QLocaleXmlWriter(qxml.write)
+    xmlOut.version(cldr_version)
+    xmlOut.enumData(enumdata.language_list,
+                    enumdata.script_list,
+                    enumdata.country_list)
+    xmlOut.likelySubTags(likelySubtags(os.path.join(cldr_dir, '..'), err))
+    xmlOut.locales(locale_database, calendars)
+    xmlOut.close()
+    if qxml is not out:
+        qxml.close()
+
+    return 0
+
+if __name__ == '__main__': # Run only when executed as a script, not on import.
+    import sys
+    sys.exit(main(sys.argv, sys.stdout, sys.stderr)) # main()'s return value is the exit status.
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index 87e356b8a7..b64ff56c64 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -1,7 +1,7 @@
 # coding=utf8
 #############################################################################
 ##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@@ -28,11 +28,17 @@
 #############################################################################
 """Shared serialization-scanning code for QLocaleXML format.
 
-The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py
+Provides classes:
+  Locale -- common data-type representing one locale as a namespace
+  QLocaleXmlWriter -- helper to write a QLocaleXML file
+
+Support:
+  Spacer -- provides control over indentation of the output.
 """
+from __future__ import print_function
 from xml.sax.saxutils import escape
 
-import xpathlite
+from xpathlite import Error
 
 # Tools used by Locale:
 def camel(seq):
@@ -43,10 +49,14 @@ def camel(seq):
 def camelCase(words):
     return ''.join(camel(iter(words)))
 
+def addEscapes(s):
+    """Return s with each character of ord() 128 or more replaced by a hex escape."""
+    return ''.join(c if ord(c) < 128 else '\\x{:02x}'.format(ord(c)) for c in s)
+
 def ordStr(c):
     if len(c) == 1:
         return str(ord(c))
-    raise xpathlite.Error('Unable to handle value "%s"' % addEscapes(c))
+    raise Error('Unable to handle value "{}"'.format(addEscapes(c)))
 
 # Fix for a problem with QLocale returning a character instead of
 # strings for QLocale::exponential() and others. So we fallback to
@@ -69,6 +79,8 @@ def convertFormat(format):
     * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
     * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
     """
+    # Compare and contrast dateconverter.py's convert_date().
+    # Need to (check consistency and) reduce redundancy !
     result = ""
     i = 0
     while i < len(format):
@@ -113,7 +125,163 @@ def convertFormat(format):
 
     return result
 
-class Locale:
+class Spacer (object):
+    """Callable that adds indentation and line-breaks to lines of XML."""
+    def __init__(self, indent = None, initial = ''):
+        """Prepare to manage indentation and line breaks.
+
+        Arguments are both optional.  First argument, indent, is
+        either None (its default, for 'minifying'), an integer (number
+        of spaces) or the unit of text that is to be used for each
+        indentation level (e.g. a tab character).  If indent is None,
+        no indentation is added, nor are line-breaks; otherwise,
+        self(text) shall end with a newline and, for non-empty text,
+        begin with indentation.
+
+        Second argument, initial, is the initial indentation; it is
+        ignored if indent is None.  Indentation increases after each
+        call to self(text) in which text starts with a tag and doesn't
+        include its end-tag; indentation decreases if text starts with
+        an end-tag.  The text is not parsed any more carefully than
+        just described.
+        """
+        if indent is None:
+            self.__call = lambda x: x
+        else:
+            self.__each = ' ' * indent if isinstance(indent, int) else indent
+            self.current = initial
+            self.__call = self.__wrap
+
+    def __wrap(self, line):
+        if not line:
+            return '\n'
+        indent = self.current
+        if line.startswith('</'):
+            # Not indent[:-n]: when n is 0, that would discard all of indent.
+            indent = self.current = indent[:len(indent) - len(self.__each)]
+        elif line.startswith('<') and not line.startswith('<!'):
+            cut = line.find('>')
+            tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0]
+            if '</{}>'.format(tag) not in line:
+                self.current += self.__each
+        return indent + line + '\n'
+
+    def __call__(self, line):
+        return self.__call(line)
+
+class QLocaleXmlWriter (object):
+    """Writes the sections of a QLocaleXML file via a caller-supplied callable."""
+    def __init__(self, save = None, space = None):
+        """Set up to write digested CLDR data as QLocale XML.
+
+        Arguments are both optional.
+
+        First argument, save, is None (its default) or a callable that
+        will write content to where you intend to save it. If None, it
+        is replaced with a callable that prints the given content,
+        suppressing the newline (but see the following); this is
+        equivalent to passing sys.stdout.write.
+
+        Second argument, space, is an object to call on each text
+        output to prepend indentation and append newlines, or not as
+        the case may be. Its default, None, selects a fresh Spacer(4),
+        which grows indent by four spaces after each unmatched new tag
+        and shrinks back on a close-tag (its parsing is naive, but
+        adequate to this class's use), adding a newline to each line.
+        """
+        self.__rawOutput = self.__printit if save is None else save
+        # A Spacer tracks indentation state, so must not be shared between writers:
+        self.__wrap = Spacer(4) if space is None else space
+        self.__write('<localeDatabase>')
+
+    # Output of various sections, in their usual order:
+    def enumData(self, languages, scripts, countries):
+        self.__enumTable('languageList', languages)
+        self.__enumTable('scriptList', scripts)
+        self.__enumTable('countryList', countries)
+
+    def likelySubTags(self, entries):
+        self.__openTag('likelySubtags')
+        for have, give in entries:
+            self.__openTag('likelySubtag')
+            self.__likelySubTag('from', have)
+            self.__likelySubTag('to', give)
+            self.__closeTag('likelySubtag')
+        self.__closeTag('likelySubtags')
+
+    def locales(self, locales, calendars):
+        self.__openTag('localeList')
+        self.__openTag('locale')
+        Locale.C(calendars).toXml(self.inTag, calendars)
+        self.__closeTag('locale')
+        for key in sorted(locales):
+            self.__openTag('locale')
+            locales[key].toXml(self.inTag, calendars)
+            self.__closeTag('locale')
+        self.__closeTag('localeList')
+
+    def version(self, cldrVersion):
+        self.inTag('version', cldrVersion)
+
+    def inTag(self, tag, text):
+        self.__write('<{0}>{1}</{0}>'.format(tag, text))
+
+    def close(self):
+        if self.__rawOutput != self.__complain:
+            self.__write('</localeDatabase>')
+        self.__rawOutput = self.__complain
+
+    # Implementation details
+    @staticmethod
+    def __printit(text):
+        print(text, end='')
+    @staticmethod
+    def __complain(text):
+        raise Error('Attempted to write data after closing :-(')
+
+    def __enumTable(self, tag, table):
+        self.__openTag(tag)
+        for key, value in table.items():
+            self.__openTag(tag[:-4])
+            self.inTag('name', value[0])
+            self.inTag('id', key)
+            self.inTag('code', value[1])
+            self.__closeTag(tag[:-4])
+        self.__closeTag(tag)
+
+    def __likelySubTag(self, tag, likely):
+        self.__openTag(tag)
+        self.inTag('language', likely[0])
+        self.inTag('script', likely[1])
+        self.inTag('country', likely[2])
+        # self.inTag('variant', likely[3])
+        self.__closeTag(tag)
+
+    def __openTag(self, tag):
+        self.__write('<{}>'.format(tag))
+    def __closeTag(self, tag):
+        self.__write('</{}>'.format(tag))
+
+    def __write(self, line):
+        self.__rawOutput(self.__wrap(line))
+
+class Locale (object):
+    """Holder for the assorted data representing one locale.
+
+    Implemented as a namespace; its constructor and update() have the
+    same signatures as those of a dict, acting on the instance's
+    __dict__, so the results are accessed as attributes rather than
+    mapping keys."""
+    def __init__(self, data=None, **kw):
+        self.update(data, **kw)
+
+    def update(self, data=None, **kw):
+        if data: self.__dict__.update(data)
+        if kw: self.__dict__.update(kw)
+
+    def __len__(self): # Used when testing as a boolean
+        return len(self.__dict__)
+
     @staticmethod
     def propsMonthDay(scale, lengths=('long', 'short', 'narrow')):
         for L in lengths:
@@ -176,19 +344,26 @@ class Locale:
 
         return cls(data)
 
-    def toXml(self, calendars=('gregorian',), indent='        ', tab='    '):
-        print indent + '<locale>'
-        inner = indent + tab
+    def toXml(self, write, calendars=('gregorian',)):
+        """Writes its data as QLocale XML.
+
+        First argument, write, is a callable taking the name and
+        content of an XML element; it is expected to be the inTag
+        bound method of a QLocaleXmlWriter instance.
+
+        Optional second argument is a list of calendar names, in the
+        form used by CLDR; its default is ('gregorian',).
+        """
         get = lambda k: getattr(self, k)
         for key in ('language', 'script', 'country'):
-            print inner + "<%s>" % key + get(key) + "</%s>" % key
-            print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key
+            write(key, get(key))
+            write('{}code'.format(key), get('{}_code'.format(key)))
 
         for key in ('decimal', 'group', 'zero'):
-            print inner + "<%s>" % key + ordStr(get(key)) + "</%s>" % key
+            write(key, ordStr(get(key)))
         for key, std in (('list', ';'), ('percent', '%'),
                          ('minus', '-'), ('plus', '+'), ('exp', 'e')):
-            print inner + "<%s>" % key + fixOrdStr(get(key), std) + "</%s>" % key
+            write(key, fixOrdStr(get(key), std))
 
         for key in ('languageEndonym', 'countryEndonym',
                     'quotationStart', 'quotationEnd',
@@ -206,16 +381,10 @@ class Locale:
                 '_'.join((k, cal))
                 for k in self.propsMonthDay('months')
                 for cal in calendars):
-            print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key)
+            write(key, escape(get(key)).encode('utf-8'))
 
         for key in ('currencyDigits', 'currencyRounding'):
-            print inner + "<%s>%d</%s>" % (key, get(key), key)
-
-        print indent + "</locale>"
-
-    def __init__(self, data=None, **kw):
-        if data: self.__dict__.update(data)
-        if kw: self.__dict__.update(kw)
+            write(key, get(key))
 
     # Tools used by __monthNames:
     def fullName(i, name): return name
@@ -261,8 +430,8 @@ class Locale:
         for cal in calendars:
             try:
                 data = known[cal]
-            except KeyError: # Need to add an entry to known, above.
-                print 'Unsupported calendar:', cal
+            except KeyError as e: # Need to add an entry to known, above.
+                e.args += ('Unsupported calendar:', cal)
                 raise
             names, get = data[0] + ('',), data[1:]
             for n, size in enumerate(sizes):
@@ -279,7 +448,7 @@ class Locale:
                   'Thursday', 'Friday', 'Saturday', ''),
           quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
         """Returns an object representing the C locale."""
-        return cls(dict(cls.__monthNames(calendars)),
+        return cls(cls.__monthNames(calendars),
                    language='C', language_code='0', languageEndonym='',
                    script='AnyScript', script_code='0',
                    country='AnyCountry', country_code='0', countryEndonym='',