Merge "Merge remote-tracking branch 'origin/5.15' into dev"

author: Qt Forward Merge Bot <qt_forward_merge_bot@qt-project.org> 2020-04-07 01:00:12 +0200
committer: Fabian Kosmale <fabian.kosmale@qt.io> 2020-04-08 22:04:23 +0200
commit: c937ed8af4f3dfef3fd8f8c2a9815376790dd5bf (patch)
tree: 5175aff87e160ae8f32dadc60d3cfd38b73d4fb1 /util
parent: e0346df1b21cb30b54ae8d4918addc9925fa8479 (diff)
parent: 8823bb8d306d78dd6a2e121a708dc607beff58c8 (diff)
8 files changed, 2511 insertions, 1942 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
new file mode 100644
index 0000000000..4b54f50080
--- /dev/null
+++ b/util/locale_database/cldr.py
@@ -0,0 +1,718 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Digesting the CLDR's data.
+
+Provides two classes:
+  CldrReader -- driver for reading CLDR data
+  CldrAccess -- used by the reader to access the tree of data files
+
+The former should normally be all you need to access.
+See individual classes for further detail.
+"""
+
+from xml.dom import minidom
+from weakref import WeakValueDictionary as CacheDict
+import os
+
+from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from qlocalexml import Locale
+
+class CldrReader (object):
+    def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None):
+        """Set up a reader object for reading CLDR data.
+
+        Single parameter, root, is the file-system path to the root of
+        the unpacked CLDR archive; its common/ sub-directory should
+        contain dtd/, main/ and supplemental/ sub-directories.
+
+        Optional second argument, grumble, is a callable that logs
+        warnings and complaints, e.g. sys.stderr.write would be a
+        suitable callable.  The default is a no-op that ignores its
+        single argument.  Optional third argument is similar, used for
+        less interesting output; pass sys.stderr.write for it for
+        verbose output."""
+        self.root = CldrAccess(root)
+        self.whitter, self.grumble = whitter, grumble
+
+    def likelySubTags(self):
+        """Generator for likely subtag information.
+
+        Yields pairs (have, give) of 4-tuples; if what you have
+        matches the left member, giving the right member is probably
+        sensible. Each 4-tuple's entries are the full names of a
+        language, a script, a country (strictly territory) and a
+        variant (currently ignored)."""
+        skips = []
+        for got, use in self.root.likelySubTags():
+            try:
+                have = self.__parseTags(got)
+                give = self.__parseTags(use)
+            except Error as e:
+                if ((use.startswith(got) or got.startswith('und_'))
+                    and e.message.startswith('Unknown ') and ' code ' in e.message):
+                    skips.append(use)
+                else:
+                    self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message))
+                continue
+            if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]):
+                continue
+
+            give = (give[0],
+                    # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+                    have[1] if give[1] == 'AnyScript' else give[1],
+                    have[2] if give[2] == 'AnyCountry' else give[2],
+                    give[3]) # AnyVariant similarly ?
+
+            yield have, give
+
+        if skips:
+            # TODO: look at LDML's reserved locale tag names; they
+            # show up a lot in this, and may be grounds for filtering
+            # more out.
+            pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
+
+    def readLocales(self, calendars = ('gregorian',)):
+        locales = tuple(self.__allLocales(calendars))
+        return dict(((k.language_id, k.script_id, k.country_id, k.variant_code),
+                     k) for k in locales)
+
+    def __allLocales(self, calendars):
+        def skip(locale, reason):
+            return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason)
+
+        for locale in self.root.defaultContentLocales:
+            try:
+                language, script, country, variant = self.__splitLocale(locale)
+            except ValueError:
+                self.whitter(skip(locale, 'only language tag'))
+                continue
+
+            if not (script or country):
+                self.grumble(skip(locale, 'second tag is neither script nor territory'))
+                continue
+
+            if not (language and country):
+                continue
+
+            try:
+                yield self.__getLocaleData(self.root.locale(locale), calendars,
+                                           language, script, country, variant)
+            except Error as e:
+                self.grumble(skip(locale, e.message))
+
+        for locale in self.root.fileLocales:
+            try:
+                chain = self.root.locale(locale)
+                language, script, country, variant = chain.tagCodes()
+                assert language
+                # TODO: this skip should probably be based on likely
+                # sub-tags, instead of empty country: if locale has a
+                # likely-subtag expansion, that's what QLocale uses,
+                # and we'll be saving its data for the expanded locale
+                # anyway, so don't need to record it for itself.
+                # See also QLocaleXmlReader.loadLocaleMap's grumble.
+                if not country:
+                    continue
+                yield self.__getLocaleData(chain, calendars, language, script, country, variant)
+            except Error as e:
+                self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message))
+
+    import textwrap
+    @staticmethod
+    def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap):
+        writer('\n'.join(wrap(prefix + ', '.join(tokens),
+                              subsequent_indent=' ', width=80)) + '\n')
+    del textwrap
+
+    def __parseTags(self, locale):
+        tags = self.__splitLocale(locale)
+        language = tags.next()
+        script = country = variant = ''
+        try:
+            script, country, variant = tags
+        except ValueError:
+            pass
+        return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant))
+
+    def __splitLocale(self, name):
+        """Generate (language, script, territory, variant) from a locale name
+
+        Ignores any trailing fields (with a warning), leaves script (a
+        capitalised four-letter token), territory (either a number or
+        an all-uppercase token) or variant (upper case and digits)
+        empty if unspecified.  Only generates one entry if name is a
+        single tag (i.e. contains no underscores).  Always yields 1 or
+        4 values, never 2 or 3."""
+        tags = iter(name.split('_'))
+        yield tags.next() # Language
+        tag = tags.next() # may raise StopIteration
+
+        # Script is always four letters, always capitalised:
+        if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+            yield tag
+            try:
+                tag = tags.next()
+            except StopIteration:
+                tag = ''
+        else:
+            yield ''
+
+        # Territory is upper-case or numeric:
+        if tag and (tag.isupper() or tag.isdigit()):
+            yield tag
+            try:
+                tag = tags.next()
+            except StopIteration:
+                tag = ''
+        else:
+            yield ''
+
+        # Variant can be any mixture of upper-case and digits.
+        if tag and all(c.isupper() or c.isdigit() for c in tag):
+            yield tag
+            tag = ''
+        else:
+            yield ''
+
+        # If nothing is left, StopIteration will avoid the warning:
+        if not tag:
+            tag = tags.next()
+        self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join((tag,) + tuple(tags)), name))
+
+    def __getLocaleData(self, scan, calendars, language, script, country, variant):
+        ids, names = zip(*self.root.codesToIdName(language, script, country, variant))
+        assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant)
+        locale = Locale(
+            language = names[0], language_code = language, language_id = ids[0],
+            script = names[1], script_code = script, script_id = ids[1],
+            country = names[2], country_code = country, country_id = ids[2],
+            variant_code = variant)
+
+        firstDay, weStart, weEnd = self.root.weekData(country)
+        assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+                   for day in (firstDay, weStart, weEnd))
+
+        locale.update(firstDayOfWeek = firstDay,
+                      weekendStart = weStart,
+                      weekendEnd = weEnd)
+
+        iso, digits, rounding = self.root.currencyData(country)
+        locale.update(currencyIsoCode = iso,
+                      currencyDigits = int(digits),
+                      currencyRounding = int(rounding))
+
+        locale.update(scan.currencyData(iso))
+        locale.update(scan.numericData(self.root.numberSystem, self.whitter))
+        locale.update(scan.textPatternData())
+        locale.update(scan.endonyms(language, script, country, variant))
+        locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ...
+        locale.update(scan.calendarNames(calendars)) # Names of days and months
+
+        return locale
+
+# Note: various caches assume this class is a singleton, so the
+# "default" value for a parameter no caller should pass can serve as
+# the cache. If a process were to instantiate this class with distinct
+# roots, each cache would be filled by the first to need it !
+class CldrAccess (object):
+    def __init__(self, root):
+        """Set up a master object for accessing CLDR data.
+
+        Single parameter, root, is the file-system path to the root of
+        the unpacked CLDR archive; its common/ sub-directory should
+        contain dtd/, main/ and supplemental/ sub-directories."""
+        self.root = root
+
+    def xml(self, *path):
+        """Load a single XML file and return its root element as an XmlScanner.
+
+        The path is interpreted relative to self.root"""
+        return XmlScanner(Node(self.__xml(path)))
+
+    def supplement(self, name):
+        """Loads supplemental data as a Supplement object.
+
+        The name should be that of a file in common/supplemental/, without path.
+        """
+        return Supplement(Node(self.__xml(('common', 'supplemental', name))))
+
+    def locale(self, name):
+        """Loads all data for a locale as a LocaleScanner object.
+
+        The name should be a locale name; adding suffix '.xml' to it
+        should usually yield a file in common/main/.  The returned
+        LocaleScanner object packages this file along with all those
+        from which it inherits; its methods know how to handle that
+        inheritance, where relevant."""
+        return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
+
+    @property
+    def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir,
+                    splitExtension = os.path.splitext):
+        """Generator for locale IDs seen in file-names.
+
+        All *.xml other than root.xml in common/main/ are assumed to
+        identify locales."""
+        for name in listDirectory(joinPath(self.root, 'common', 'main')):
+            stem, ext = splitExtension(name)
+            if ext == '.xml' and stem != 'root':
+                yield stem
+
+    @property
+    def defaultContentLocales(self):
+        """Generator for the default content locales."""
+        for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'):
+            try:
+                locales = attrs['locales']
+            except KeyError:
+                pass
+            else:
+                for locale in locales.split():
+                    yield locale
+
+    def likelySubTags(self):
+        for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'):
+            yield attrs['from'], attrs['to']
+
+    def numberSystem(self, system):
+        """Get a description of a numbering system.
+
+        Returns a mapping, with keys u'digits', u'type' and u'id'; the
+        value for this last is system. Raises ldml.Error (never a bare
+        KeyError) if system is not a known number system."""
+        try:
+            return self.__numberSystems[system]
+        except KeyError:
+            raise Error('Unsupported number system: {}'.format(system))
+
+    def weekData(self, country):
+        """Data on the weekly cycle.
+
+        Returns a triple (W, S, E) of en's short names for week-days;
+        W is the first day of the week, S the start of the week-end
+        and E the end of the week-end.  Where data for a country is
+        unavailable, the data for CLDR's territory 001 (The World) is
+        used."""
+        try:
+            return self.__weekData[country]
+        except KeyError:
+            return self.__weekData['001']
+
+    def currencyData(self, country):
+        """Returns currency data for the given country code.
+
+        Return value is a tuple (ISO4217 code, digit count, rounding
+        mode).  If CLDR provides no data for this country, ('', 2, 1)
+        is the default result.
+        """
+        try:
+            return self.__currencyData[country]
+        except KeyError:
+            return '', 2, 1
+
+    def codesToIdName(self, language, script, country, variant = ''):
+        """Maps each code to the appropriate ID and name.
+
+        Returns a 4-tuple of (ID, name) pairs corresponding to the
+        language, script, country and variant given.  Raises a
+        suitable error if any of them is unknown, indicating all that
+        are unknown plus suitable names for any that could sensibly be
+        added to enumdata.py to make them known.
+
+        Until we implement variant support (QTBUG-81051), the fourth
+        member of the returned tuple is always 0 paired with a string
+        that should not be used."""
+        enum = self.__enumMap
+        try:
+            return (enum('language')[language],
+                    enum('script')[script],
+                    enum('country')[country],
+                    enum('variant')[variant])
+        except KeyError:
+            pass
+
+        parts, values = [], [language, script, country, variant]
+        for index, key in enumerate(('language', 'script', 'country', 'variant')):
+            naming, enums = self.__codeMap(key), enum(key)
+            value = values[index]
+            if value not in enums:
+                text = '{} code {}'.format(key, value)
+                name = naming.get(value)
+                if name and value != 'POSIX':
+                    text += u' (could add {})'.format(name)
+                parts.append(text)
+        if len(parts) > 1:
+            parts[-1] = 'and ' + parts[-1]
+        assert parts
+        raise Error('Unknown ' + ', '.join(parts),
+                    language, script, country, variant)
+
+    def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
+        """Digest CLDR's MS-Win time-zone name mapping.
+
+        MS-Win have their own eccentric names for time-zones.  CLDR
+        helpfully provides a translation to more orthodox names.
+
+        Single argument, lookup, is a mapping from known MS-Win names
+        for locales to a unique integer index (starting at 1).
+
+        The XML structure we read has the form:
+
+ <supplementalData>
+     <windowsZones>
+         <mapTimezones otherVersion="..." typeVersion="...">
+             <!-- (UTC-08:00) Pacific Time (US & Canada) -->
+             <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
+             <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
+             <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
+             <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
+         </mapTimezones>
+     </windowsZones>
+ </supplementalData>
+"""
+        zones = self.supplement('windowsZones.xml')
+        enum = self.__enumMap('country')
+        badZones, unLands, defaults, windows = set(), set(), {}, {}
+
+        for name, attrs in zones.find('windowsZones/mapTimezones'):
+            if name != 'mapZone':
+                continue
+
+            wid, code = attrs['other'], attrs['territory']
+            data = dict(windowsId = wid,
+                        countryCode = code,
+                        ianaList = attrs['type'])
+
+            try:
+                key = lookup[wid]
+            except KeyError:
+                badZones.add(wid)
+                key = 0
+            data['windowsKey'] = key
+
+            if code == u'001':
+                defaults[key] = data['ianaList']
+            else:
+                try:
+                    cid, name = enum[code]
+                except KeyError:
+                    unLands.add(code) # a set: collected for the Error below
+                    continue
+                data.update(countryId = cid, country = name)
+                windows[key, cid] = data
+
+        if unLands:
+            raise Error('Unknown country codes, please add to enumdata.py: '
+                        + ', '.join(sorted(unLands)))
+
+        if badZones:
+            raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
+                        + ', '.join(sorted(badZones)))
+
+        return self.cldrVersion, defaults, windows
+
+    @property
+    def cldrVersion(self):
+        # Evaluate so as to ensure __cldrVersion is set:
+        self.__unDistinguishedAttributes
+        return self.__cldrVersion
+
+    # Implementation details
+    def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
+        try:
+            doc = cache[path]
+        except KeyError:
+            cache[path] = doc = read(joinPath(self.root, *path)).documentElement
+        return doc
+
+    def __open(self, path, joinPath=os.path.join):
+        return open(joinPath(self.root, *path))
+
+    @property
+    def __rootLocale(self, cache = []):
+        if not cache:
+            cache.append(self.xml('common', 'main', 'root.xml'))
+        return cache[0]
+
+    @property
+    def __supplementalData(self, cache = []):
+        if not cache:
+            cache.append(self.supplement('supplementalData.xml'))
+        return cache[0]
+
+    @property
+    def __numberSystems(self, cache = {}, joinPath=os.path.join):
+        if not cache:
+            for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
+                cache[attrs['id']] = attrs
+            assert cache
+        return cache
+
+    @property
+    def __weekData(self, cache = {}):
+        if not cache:
+            firstDay, weStart, weEnd = self.__getWeekData()
+            # Massage those into an easily-consulted form:
+            # World defaults given for code '001':
+            mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001']
+            lands = set(firstDay) | set(weStart) | set(weEnd)
+            cache.update((land,
+                          (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun)))
+                         for land in lands)
+            assert cache
+        return cache
+
+    def __getWeekData(self):
+        """Scan for data on the weekly cycle.
+
+        Yields three mappings from locales to en's short names for
+        week-days; if a locale isn't a key of a given mapping, it
+        should use the '001' (world) locale's value. The first mapping
+        gives the day on which the week starts, the second gives the
+        day on which the week-end starts, the third gives the last day
+        of the week-end."""
+        source = self.__supplementalData
+        for key in ('firstDay', 'weekendStart', 'weekendEnd'):
+            result = {}
+            for ignore, attrs in source.find('weekData/' + key):
+                assert ignore == key
+                day = attrs['day']
+                assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
+                if 'alt' in attrs:
+                    continue
+                for loc in attrs.get('territories', '').split():
+                    result[loc] = day
+            yield result
+
+    @property
+    def __currencyData(self, cache = {}):
+        if not cache:
+            source = self.__supplementalData
+            for elt in source.findNodes('currencyData/region'):
+                iso, digits, rounding = '', 2, 1
+                try:
+                    country = elt.dom.attributes['iso3166'].nodeValue
+                except KeyError:
+                    continue
+                for child in elt.findAllChildren('currency'):
+                    try:
+                        if child.dom.attributes['tender'].nodeValue == 'false':
+                            continue
+                    except KeyError:
+                        pass
+                    try:
+                        child.dom.attributes['to'] # Is set if this element has gone out of date.
+                    except KeyError:
+                        iso = child.dom.attributes['iso4217'].nodeValue
+                        break
+                if iso:
+                    for tag, data in source.find(
+                        'currencyData/fractions/info[iso4217={}]'.format(iso)):
+                        digits = data['digits']
+                        rounding = data['rounding']
+                cache[country] = iso, digits, rounding
+            assert cache
+
+        return cache
+
+    @property
+    def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join):
+        """Mapping from tag names to lists of attributes.
+
+        LDML defines some attributes as 'distinguishing': if a node
+        has distinguishing attributes that weren't specified in an
+        XPath, a search on that XPath should exclude the node's
+        children.
+
+        This property is a mapping from tag names to tuples of
+        attribute names that *aren't* distinguishing for that tag.
+        Its value is cached (so its costly computation is only done
+        once) and there's a side-effect of populating its cache: it
+        sets self.__cldrVersion to the value found in ldml.dtd, during
+        parsing."""
+        if not cache:
+            cache.update(self.__scanLdmlDtd())
+            assert cache
+
+        return cache
+
+    def __scanLdmlDtd(self, joinPath = os.path.join):
+        """Scan the LDML DTD, record CLDR version
+
+        Yields (tag, attrs) pairs: on elements with a given tag,
+        attributes named in its attrs (a tuple) may be ignored in an
+        XPath search; other attributes are distinguished attributes,
+        in the terminology of LDML's locale-inheritance rules.
+
+        Sets self.__cldrVersion as a side-effect, since this
+        information is found in the same file."""
+        with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
+            tag, ignored, last = None, None, None
+
+            for line in dtd:
+                if line.startswith('<!ELEMENT '):
+                    if ignored:
+                        assert tag
+                        yield tag, tuple(ignored)
+                    tag, ignored, last = line.split()[1], [], None
+                    continue
+
+                if line.startswith('<!ATTLIST '):
+                    assert tag is not None
+                    parts = line.split()
+                    assert parts[1] == tag
+                    last = parts[2]
+                    if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
+                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+                        self.__cldrVersion = parts[5].split('"')[1]
+                    continue
+
+                # <!ELEMENT...>s can also be @METADATA, but not @VALUE:
+                if '<!--@VALUE-->' in line or (last and '<!--@METADATA-->' in line):
+                    assert last is not None
+                    assert ignored is not None
+                    assert tag is not None
+                    ignored.append(last)
+                    last = None # No attribute is both value and metadata
+
+            if tag and ignored:
+                yield tag, tuple(ignored)
+
+    def __enumMap(self, key, cache = {}):
+        if not cache:
+            cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
+            # They're not actually lists: mappings from numeric value
+            # to pairs of full name and short code. What we want, in
+            # each case, is a mapping from code to the other two.
+            from enumdata import language_list, script_list, country_list
+            for form, book, empty in (('language', language_list, 'AnyLanguage'),
+                                      ('script', script_list, 'AnyScript'),
+                                      ('country', country_list, 'AnyCountry')):
+                cache[form] = dict((pair[1], (num, pair[0]))
+                                   for num, pair in book.items() if pair[0] != 'C')
+                # (Have to filter out the C locale, as we give it the
+                # same (all space) code as AnyLanguage, whose code
+                # should probably be 'und' instead.)
+
+                # Map empty to zero and the any value:
+                cache[form][''] = (0, empty)
+            # and map language code 'und' also to (0, any):
+            cache['language']['und'] = (0, 'AnyLanguage')
+
+        return cache[key]
+
+    def __codeMap(self, key, cache = {},
+                  # Maps our name for it to CLDR's name:
+                  naming = {'language': 'languages', 'script': 'scripts',
+                            'country': 'territories', 'variant': 'variants'}):
+        if not cache:
+            root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames')
+            for dst, src in naming.items():
+                cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
+            assert cache
+
+        return cache[key]
+
+    def __codeMapScan(self, node):
+        """Get mapping from codes to element values.
+
+        Passed in node is a <languages>, <scripts>, <territories> or
+        <variants> node, each child of which is a <language>,
+        <script>, <territory> or <variant> node as appropriate, whose
+        type is a code (of the appropriate flavour) and content is its
+        full name.  In some cases, two child nodes have the same type;
+        in these cases, one always has an alt attribute and we should
+        prefer the other.  Yields all such type, content pairs found
+        in node's children (skipping any with an alt attribute, if
+        their type has been seen previously)."""
+        seen = set()
+        for elt in node.dom.childNodes:
+            try:
+                key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText
+            except (KeyError, ValueError, TypeError):
+                pass
+            else:
+                if key not in seen or not elt.attributes.has_key('alt'):
+                    yield key, value
+                    seen.add(key)
+
+    # CLDR uses inheritance between locales to save repetition:
+    def __parentLocale(self, name, cache = {}):
+        # see http://www.unicode.org/reports/tr35/#Parent_Locales
+        if not cache:
+            for tag, attrs in self.__supplementalData.find('parentLocales'):
+                parent = attrs.get('parent', '')
+                for child in attrs['locales'].split():
+                    cache[child] = parent
+            assert cache
+
+        return cache[name]
+
+    def __localeAsDoc(self, name, aliasFor = None,
+                      joinPath = os.path.join, exists = os.path.isfile):
+        path = ('common', 'main', name + '.xml')
+        if exists(joinPath(self.root, *path)):
+            elt = self.__xml(path)
+            for child in Node(elt).findAllChildren('alias'):
+                try:
+                    alias = child.dom.attributes['source'].nodeValue
+                except (KeyError, AttributeError):
+                    pass
+                else:
+                    return self.__localeAsDoc(alias, aliasFor or name)
+            # No alias child with a source:
+            return elt
+
+        if aliasFor:
+            raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias'
+                        .format(aliasFor, name))
+
+    def __scanLocaleRoots(self, name):
+        while name and name != 'root':
+            doc = self.__localeAsDoc(name)
+            if doc is not None:
+                yield Node(doc, self.__unDistinguishedAttributes)
+
+            try:
+                name = self.__parentLocale(name)
+            except KeyError:
+                try:
+                    name, tail = name.rsplit('_', 1)
+                except ValueError: # No tail to discard: we're done
+                    break
+
+    class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
+    def __localeRoots(self, name, cache = CacheDict()):
+        try:
+            chain = cache[name]
+        except KeyError:
+            cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name))
+        return chain
+
+# Unpollute the namespace: we don't need to export these.
+del minidom, CacheDict, os
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index 7f98e29d47..c05cabf520 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -31,15 +31,17 @@
 
 The CLDR data can be downloaded from CLDR_, which has a sub-directory
 for each version; you need the ``core.zip`` file for your version of
-choice (typically the latest).  This script has had updates to cope up
-to v35; for later versions, we may need adaptations.  Unpack the
+choice (typically the latest). This script has had updates to cope up
+to v35; for later versions, we may need adaptations. Unpack the
 downloaded ``core.zip`` and check it has a common/main/ sub-directory:
-pass the path of that sub-directory to this script as its single
-command-line argument.  Save its standard output (but not error) to a
-file for later processing by ``./qlocalexml2cpp.py``
+pass the path of the root of the download to this script as its first
+command-line argument. Pass the name of the file in which to write
+output as the second argument; either omit it or use '-' to select the
+standard output. This file is the input needed by
+``./qlocalexml2cpp.py``
 
 When you update the CLDR data, be sure to also update
-src/corelib/text/qt_attribution.json's entry for unicode-cldr.  Check
+src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
 this script's output for unknown language, country or script messages;
 if any can be resolved, use their entry in common/main/en.xml to
 append new entries to enumdata.py's lists and update documentation in
@@ -54,646 +56,67 @@ time zone names; see cldr2qtimezone.py for details.
 
 import os
 import sys
-import re
-import textwrap
 
-import enumdata
-import xpathlite
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
-from dateconverter import convert_date
-from qlocalexml import Locale
-
-# TODO: make calendars a command-line option
-calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
-findEntryInFile = xpathlite._findEntryInFile
-def wrappedwarn(prefix, tokens):
-    return sys.stderr.write(
-        '\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
-                                subsequent_indent=' ', width=80)) + '\n')
-
-def parse_number_format(patterns, data):
-    # this is a very limited parsing of the number format for currency only.
-    def skip_repeating_pattern(x):
-        p = x.replace('0', '#').replace(',', '').replace('.', '')
-        seen = False
-        result = ''
-        for c in p:
-            if c == '#':
-                if seen:
-                    continue
-                seen = True
-            else:
-                seen = False
-            result = result + c
-        return result
-    patterns = patterns.split(';')
-    result = []
-    for pattern in patterns:
-        pattern = skip_repeating_pattern(pattern)
-        pattern = pattern.replace('#', "%1")
-        # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
-        # there can be doubled or trippled currency sign, however none of the
-        # locales use that.
-        pattern = pattern.replace(u'\xa4', "%2")
-        pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'")
-        pattern = pattern.replace('-', data['minus'])
-        pattern = pattern.replace('+', data['plus'])
-        result.append(pattern)
-    return result
-
-def raiseUnknownCode(code, form, cache={}):
-    """Check whether an unknown code could be supported.
-
-    We declare a language, script or country code unknown if it's not
-    known to enumdata.py; however, if it's present in main/en.xml's
-    mapping of codes to names, we have the option of adding support.
-    This caches the necessary look-up (so we only read main/en.xml
-    once) and returns the name we should use if we do add support.
-
-    First parameter, code, is the unknown code.  Second parameter,
-    form, is one of 'language', 'script' or 'country' to select the
-    type of code to look up.  Do not pass further parameters (the next
-    will deprive you of the cache).
-
-    Raises xpathlite.Error with a suitable message, that includes the
-    unknown code's full name if found.
-
-    Relies on global cldr_dir being set before it's called; see tail
-    of this file.
-    """
-    if not cache:
-        cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
-    name = cache[form].get(code)
-    msg = 'unknown %s code "%s"' % (form, code)
-    if name:
-        msg += ' - could use "%s"' % name
-    raise xpathlite.Error(msg)
-
-def parse_list_pattern_part_format(pattern):
-    # This is a very limited parsing of the format for list pattern part only.
-    return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
-
-def unit_quantifiers(find, path, stem, suffix, known,
-                     # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
-                     # 1000^7 < zebi = 2^{70}, the next quantifiers up:
-                     si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
-    """Work out the unit quantifiers.
-
-    Unfortunately, the CLDR data only go up to terabytes and we want
-    all the way to exabytes; but we can recognize the SI quantifiers
-    as prefixes, strip and identify the tail as the localized
-    translation for 'B' (e.g. French has 'octet' for 'byte' and uses
-    ko, Mo, Go, To from which we can extrapolate Po, Eo).
-
-    Should be called first for the SI quantifiers, with suffix = 'B',
-    then for the IEC ones, with suffix = 'iB'; the list known
-    (initially empty before first call) is used to let the second call
-    know what the first learned about the localized unit.
-    """
-    if suffix == 'B': # first call, known = []
-        tail = suffix
-        for q in si_quantifiers:
-            it = find(path, stem % q)
-            # kB for kilobyte, in contrast with KiB for IEC:
-            q = q[0] if q == 'kilo' else q[0].upper()
-            if not it:
-                it = q + tail
-            elif it.startswith(q):
-                rest = it[1:]
-                tail = rest if all(rest == k for k in known) else suffix
-                known.append(rest)
-            yield it
-    else: # second call, re-using first's known
-        assert suffix == 'iB'
-        if known:
-            byte = known.pop()
-            if all(byte == k for k in known):
-                suffix = 'i' + byte
-        for q in si_quantifiers:
-            yield find(path, stem % q[:2],
-                       # Those don't (yet, v31) exist in CLDR, so we always fall back to:
-                       q[0].upper() + suffix)
-
-def generateLocaleInfo(path):
-    if not path.endswith(".xml"):
-        return {}
-
-    # skip legacy/compatibility ones
-    alias = findAlias(path)
-    if alias:
-        raise xpathlite.Error('alias to "%s"' % alias)
-
-    def code(tag):
-        return findEntryInFile(path, 'identity/' + tag, attribute="type")[0]
-
-    return _generateLocaleInfo(path, code('language'), code('script'),
-                               code('territory'), code('variant'))
-
-def getNumberSystems(cache={}):
-    """Cached look-up of number system information.
-
-    Pass no arguments.  Returns a mapping from number system names to,
-    for each system, a mapping with keys u'digits', u'type' and
-    u'id'\n"""
-    if not cache:
-        for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
-                                              'numberingSystems.xml'),
-                                 'numberingSystems'):
-            # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]]
-            entry = dict(ns[1])
-            cache[entry[u'id']] = entry
-    return cache
-
-def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
-    if not path.endswith(".xml"):
-        return {}
-
-    if language_code == 'root':
-        # just skip it
-        return {}
-
-    # we do not support variants
-    # ### actually there is only one locale with variant: en_US_POSIX
-    #     does anybody care about it at all?
-    if variant_code:
-        raise xpathlite.Error('we do not support variants ("%s")' % variant_code)
-
-    language_id = enumdata.languageCodeToId(language_code)
-    if language_id <= 0:
-        raiseUnknownCode(language_code, 'language')
-
-    script_id = enumdata.scriptCodeToId(script_code)
-    if script_id == -1:
-        raiseUnknownCode(script_code, 'script')
-
-    # we should handle fully qualified names with the territory
-    if not country_code:
-        return {}
-    country_id = enumdata.countryCodeToId(country_code)
-    if country_id <= 0:
-        raiseUnknownCode(country_code, 'country')
-
-    # So we say we accept only those values that have "contributed" or
-    # "approved" resolution. see http://www.unicode.org/cldr/process.html
-    # But we only respect the resolution for new datas for backward
-    # compatibility.
-    draft = DraftResolution.contributed
-
-    result = dict(
-        language=enumdata.language_list[language_id][0],
-        language_code=language_code, language_id=language_id,
-        script=enumdata.script_list[script_id][0],
-        script_code=script_code, script_id=script_id,
-        country=enumdata.country_list[country_id][0],
-        country_code=country_code, country_id=country_id,
-        variant_code=variant_code)
-
-    (dir_name, file_name) = os.path.split(path)
-    def from_supplement(tag,
-                        path=os.path.join(dir_name, '..', 'supplemental',
-                                          'supplementalData.xml')):
-        return findTagsInFile(path, tag)
-    currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
-    result['currencyIsoCode'] = ''
-    result['currencyDigits'] = 2
-    result['currencyRounding'] = 1
-    if currencies:
-        for e in currencies:
-            if e[0] == 'currency':
-                t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
-                if t and t[0]:
-                    pass
-                elif not any(x[0] == 'to' for x in e[1]):
-                    result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next()
-                    break
-        if result['currencyIsoCode']:
-            t = from_supplement("currencyData/fractions/info[iso4217=%s]"
-                                % result['currencyIsoCode'])
-            if t and t[0][0] == 'info':
-                result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next()
-                result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next()
-    numbering_system = None
-    try:
-        numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
-    except xpathlite.Error:
-        pass
-    def findEntryDef(path, xpath, value=''):
-        try:
-            return findEntry(path, xpath)
-        except xpathlite.Error:
-            return value
-    def get_number_in_system(path, xpath, numbering_system):
-        if numbering_system:
-            try:
-                return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
-            except xpathlite.Error:
-                # in CLDR 1.9 number system was refactored for numbers (but not for currency)
-                # so if previous findEntry doesn't work we should try this:
-                try:
-                    return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
-                except xpathlite.Error:
-                    # fallback to default
-                    pass
-        return findEntry(path, xpath)
-
-    result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
-    result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
-    assert result['decimal'] != result['group']
-    result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
-    result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
-    try:
-        digits = getNumberSystems()[numbering_system][u"digits"];
-        assert len(digits) == 10 and all(ord(d) - i == ord(digits[0]) for i, d in enumerate(digits))
-        result['zero'] = digits[0]
-    except Exception as e:
-        sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
-        result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
-    result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
-    result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
-    result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system)
-    result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
-    result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
-    result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
-    result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
-    result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
-    result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
-    result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
-    result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
-    result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
-    result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
-    result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
-    result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
-    result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
-    result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))
-
-    endonym = None
-    if country_code and script_code:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
-    if not endonym and script_code:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
-    if not endonym and country_code:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
-    if not endonym:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
-    result['languageEndonym'] = endonym
-    result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))
-
-    currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
-    currency_format = parse_number_format(currency_format, result)
-    result['currencyFormat'] = currency_format[0]
-    result['currencyNegativeFormat'] = ''
-    if len(currency_format) > 1:
-        result['currencyNegativeFormat'] = currency_format[1]
-
-    result['currencySymbol'] = ''
-    result['currencyDisplayName'] = ''
-    if result['currencyIsoCode']:
-        stem = "numbers/currencies/currency[%s]/" % result['currencyIsoCode']
-        result['currencySymbol'] = findEntryDef(path, stem + 'symbol')
-        displays = tuple(findEntryDef(path, stem + 'displayName' + tail)
-                         for tail in ('',) + tuple(
-                             '[count=%s]' % x for x in ('zero', 'one', 'two',
-                                                        'few', 'many', 'other')))
-        while displays and not displays[-1]:
-            displays = displays[:-1]
-        result['currencyDisplayName'] = ';'.join(displays)
-
-    def findUnitDef(path, stem, fallback=''):
-        # The displayName for a quantified unit in en.xml is kByte
-        # instead of kB (etc.), so prefer any unitPattern provided:
-        for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
-            try:
-                ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
-            except xpathlite.Error:
-                continue
-
-            # TODO: epxloit count-handling, instead of discarding placeholders
-            if ans.startswith('{0}'):
-                ans = ans[3:].lstrip()
-            if ans:
-                return ans
-
-        return findEntryDef(path, stem + 'displayName', fallback)
-
-    # First without quantifier, then quantified each way:
-    result['byte_unit'] = findEntryDef(
-        path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
-        'bytes')
-    stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
-    known = [] # cases where we *do* have a given version:
-    result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
-    # IEC 60027-2
-    # http://physics.nist.gov/cuu/Units/binary.html
-    result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
-
-    # Used for month and day data:
-    namings = (
-        ('standaloneLong', 'stand-alone', 'wide'),
-        ('standaloneShort', 'stand-alone', 'abbreviated'),
-        ('standaloneNarrow', 'stand-alone', 'narrow'),
-        ('long', 'format', 'wide'),
-        ('short', 'format', 'abbreviated'),
-        ('narrow', 'format', 'narrow'),
-        )
-
-    # Month names for 12-month calendars:
-    for cal in calendars:
-        stem = 'dates/calendars/calendar[' + cal + ']/months/'
-        for (key, mode, size) in namings:
-            prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
-            result[key + 'Months_' + cal] = ';'.join(
-                findEntry(path, stem + prop + "month[%d]" % i)
-                for i in range(1, 13))
-
-    # Day data (for Gregorian, at least):
-    stem = 'dates/calendars/calendar[gregorian]/days/'
-    days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
-    for (key, mode, size) in namings:
-        prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
-        result[key + 'Days'] = ';'.join(
-            findEntry(path, stem + prop + '[' + day + ']')
-            for day in days)
-
-    return Locale(result)
-
-def addEscapes(s):
-    result = ''
-    for c in s:
-        n = ord(c)
-        if n < 128:
-            result += c
-        else:
-            result += "\\x"
-            result += "%02x" % (n)
-    return result
-
-def unicodeStr(s):
-    utf8 = s.encode('utf-8')
-    return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"
-
-def usage():
-    print "Usage: cldr2qlocalexml.py <path-to-cldr-main>"
-    sys.exit()
-
-def integrateWeekData(filePath):
-    if not filePath.endswith(".xml"):
-        return {}
-
-    def lookup(key):
-        return findEntryInFile(filePath, key, attribute='territories')[0].split()
-    days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
-
-    firstDayByCountryCode = {}
-    for day in days:
-        for countryCode in lookup('weekData/firstDay[day=%s]' % day):
-            firstDayByCountryCode[countryCode] = day
-
-    weekendStartByCountryCode = {}
-    for day in days:
-        for countryCode in lookup('weekData/weekendStart[day=%s]' % day):
-            weekendStartByCountryCode[countryCode] = day
-
-    weekendEndByCountryCode = {}
-    for day in days:
-        for countryCode in lookup('weekData/weekendEnd[day=%s]' % day):
-            weekendEndByCountryCode[countryCode] = day
-
-    for (key, locale) in locale_database.iteritems():
-        countryCode = locale.country_code
-        if countryCode in firstDayByCountryCode:
-            locale.firstDayOfWeek = firstDayByCountryCode[countryCode]
-        else:
-            locale.firstDayOfWeek = firstDayByCountryCode["001"]
-
-        if countryCode in weekendStartByCountryCode:
-            locale.weekendStart = weekendStartByCountryCode[countryCode]
-        else:
-            locale.weekendStart = weekendStartByCountryCode["001"]
-
-        if countryCode in weekendEndByCountryCode:
-            locale.weekendEnd = weekendEndByCountryCode[countryCode]
-        else:
-            locale.weekendEnd = weekendEndByCountryCode["001"]
-
-def splitLocale(name):
-    """Split name into (language, script, territory) triple as generator.
-
-    Ignores any trailing fields (with a warning), leaves script (a capitalised
-    four-letter token) or territory (either a number or an all-uppercase token)
-    empty if unspecified, returns a single-entry generator if name is a single
-    tag (i.e. contains no underscores).  Always yields 1 or 3 values, never 2."""
-    tags = iter(name.split('_'))
-    yield tags.next() # Language
-    tag = tags.next()
-
-    # Script is always four letters, always capitalised:
-    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
-        yield tag
-        try:
-            tag = tags.next()
-        except StopIteration:
-            tag = ''
-    else:
-        yield ''
-
-    # Territory is upper-case or numeric:
-    if tag and tag.isupper() or tag.isdigit():
-        yield tag
-        tag = ''
+from localetools import Error
+from cldr import CldrReader
+from qlocalexml import QLocaleXmlWriter
+from enumdata import language_list, script_list, country_list
+
+def usage(name, err, message = ''):
+    err.write("""Usage: {} path/to/cldr/common/main [out-file.xml]
+""".format(name)) # TODO: expand command-line, improve help message
+    if message:
+        err.write('\n' + message + '\n')
+
+def main(args, out, err):
+    """Convert CLDR data to QLocaleXML, as directed by a command line.
+
+    First parameter, args, is the command line as a list: the name of
+    this script, the root of the CLDR tree and, optionally, the name
+    of the file to which to write output. It is consumed (entries are
+    popped off it) in the course of parsing. Second, out, is the
+    stream to which output is written when no file is named, or the
+    name given is '-'. Third, err, is the stream to which usage and
+    diagnostic messages are written.
+
+    Returns 1, after reporting the problem via usage(), if the command
+    line is unusable or the output file can't be opened; else 0, once
+    the output has been written.
+    """
+    # TODO: make calendars a command-line option
+    calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
+
+    # TODO: make argument parsing more sophisticated
+    name = args.pop(0)
+    if not args:
+        usage(name, err, 'Where is your CLDR data tree ?')
+        return 1
+
+    root = args.pop(0)
+    # Use the presence of root.xml to check this really is a CLDR tree:
+    if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')):
+        usage(name, err,
+              'First argument is the root of the CLDR tree: found no common/main/root.xml under '
+              + root)
+        return 1
+
+    # Output file is optional; absent or '-' selects the out stream:
+    xml = args.pop(0) if args else None
+    if not xml or xml == '-':
+        emit = out
+    elif not xml.endswith('.xml'):
+        usage(name, err, 'Please use a .xml extension on your output file name, not ' + xml)
+        return 1
     else:
-        yield ''
-
-    # If nothing is left, StopIteration will avoid the warning:
-    tag = (tag if tag else tags.next(),)
-    sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
-
-if len(sys.argv) != 2:
-    usage()
-
-cldr_dir = sys.argv[1]
-
-if not os.path.isdir(cldr_dir):
-    usage()
-
-cldr_files = os.listdir(cldr_dir)
-
-locale_database = {}
-
-# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = []
-for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
-                                      'supplementalMetadata.xml'),
-                         'metadata/defaultContent'):
-    for data in ns[1:][0]:
-        if data[0] == u"locales":
-            defaultContent_locales += data[1].split()
-
-skips = []
-for file in defaultContent_locales:
-    try:
-        language_code, script_code, country_code = splitLocale(file)
-    except ValueError:
-        sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
-        continue
-
-    if not (script_code or country_code):
-        sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
-        continue
-
-    try:
-        l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
-        if not l:
-            skips.append(file)
-            continue
-    except xpathlite.Error as e:
-        sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
-        continue
-
-    locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
-    wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
-    skips = []
-
-for file in cldr_files:
-    try:
-        l = generateLocaleInfo(cldr_dir + "/" + file)
-        if not l:
-            skips.append(file)
-            continue
-    except xpathlite.Error as e:
-        sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
-        continue
-
-    locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
-    wrappedwarn('skipping files [no locale info generated]: ', skips)
-
-integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
-locale_keys = locale_database.keys()
-locale_keys.sort()
-
-cldr_version = 'unknown'
-ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r")
-for line in ldml:
-    if 'version cldrVersion CDATA #FIXED' in line:
-        cldr_version = line.split('"')[1]
-
-if sys.stdout.encoding != 'UTF-8' or (sys.stdout.encoding is None and sys.getdefaultencoding() != 'UTF-8'):
-    reload(sys) # Weirdly, this gets a richer sys module than the plain import got us !
-    sys.setdefaultencoding('UTF-8')
-
-print "<localeDatabase>"
-print "    <version>" + cldr_version + "</version>"
-print "    <languageList>"
-for id in enumdata.language_list:
-    l = enumdata.language_list[id]
-    print "        <language>"
-    print "            <name>" + l[0] + "</name>"
-    print "            <id>" + str(id) + "</id>"
-    print "            <code>" + l[1] + "</code>"
-    print "        </language>"
-print "    </languageList>"
-
-print "    <scriptList>"
-for id in enumdata.script_list:
-    l = enumdata.script_list[id]
-    print "        <script>"
-    print "            <name>" + l[0] + "</name>"
-    print "            <id>" + str(id) + "</id>"
-    print "            <code>" + l[1] + "</code>"
-    print "        </script>"
-print "    </scriptList>"
-
-print "    <countryList>"
-for id in enumdata.country_list:
-    l = enumdata.country_list[id]
-    print "        <country>"
-    print "            <name>" + l[0] + "</name>"
-    print "            <id>" + str(id) + "</id>"
-    print "            <code>" + l[1] + "</code>"
-    print "        </country>"
-print "    </countryList>"
-
-def _parseLocale(l):
-    language = "AnyLanguage"
-    script = "AnyScript"
-    country = "AnyCountry"
-
-    if l == "und":
-        raise xpathlite.Error("we are treating unknown locale like C")
-
-    parsed = splitLocale(l)
-    language_code = parsed.next()
-    script_code = country_code = ''
-    try:
-        script_code, country_code = parsed
-    except ValueError:
-        pass
-
-    if language_code != "und":
-        language_id = enumdata.languageCodeToId(language_code)
-        if language_id == -1:
-            raise xpathlite.Error('unknown language code "%s"' % language_code)
-        language = enumdata.language_list[language_id][0]
-
-    if script_code:
-        script_id = enumdata.scriptCodeToId(script_code)
-        if script_id == -1:
-            raise xpathlite.Error('unknown script code "%s"' % script_code)
-        script = enumdata.script_list[script_id][0]
-
-    if country_code:
-        country_id = enumdata.countryCodeToId(country_code)
-        if country_id == -1:
-            raise xpathlite.Error('unknown country code "%s"' % country_code)
-        country = enumdata.country_list[country_id][0]
+        try:
+            emit = open(xml, 'w')
+        except IOError as e:
+            usage(name, err, 'Failed to open "{}" to write output to it\n'.format(xml))
+            return 1
 
-    return (language, script, country)
+    if args:
+        usage(name, err, 'Too many arguments - excess: ' + ' '.join(args))
+        return 1
 
-skips = []
-print "    <likelySubtags>"
-for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
-    tmp = {}
-    for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
-        tmp[data[0]] = data[1]
+    if emit.encoding != 'UTF-8' or (emit.encoding is None and sys.getdefaultencoding() != 'UTF-8'):
+        reload(sys) # Weirdly, this gets a richer sys module than the plain import got us !
+        sys.setdefaultencoding('UTF-8')
 
-    try:
-        from_language, from_script, from_country = _parseLocale(tmp[u"from"])
-        to_language, to_script, to_country = _parseLocale(tmp[u"to"])
-    except xpathlite.Error as e:
-        if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
-            skips.append(tmp[u'to'])
-        else:
-            sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
-        continue
-    # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
-    if to_country == "AnyCountry" and from_country != to_country:
-        to_country = from_country
-    if to_script == "AnyScript" and from_script != to_script:
-        to_script = from_script
+    # TODO - command line options to tune choice of grumble and whitter:
+    reader = CldrReader(root, err.write, err.write)
+    writer = QLocaleXmlWriter(emit.write)
 
-    print "        <likelySubtag>"
-    print "            <from>"
-    print "                <language>" + from_language + "</language>"
-    print "                <script>" + from_script + "</script>"
-    print "                <country>" + from_country + "</country>"
-    print "            </from>"
-    print "            <to>"
-    print "                <language>" + to_language + "</language>"
-    print "                <script>" + to_script + "</script>"
-    print "                <country>" + to_country + "</country>"
-    print "            </to>"
-    print "        </likelySubtag>"
-print "    </likelySubtags>"
-if skips:
-    wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
-print "    <localeList>"
+    writer.version(reader.root.cldrVersion)
+    writer.enumData(language_list, script_list, country_list)
+    writer.likelySubTags(reader.likelySubTags())
+    writer.locales(reader.readLocales(calendars), calendars)
 
-Locale.C(calendars).toXml(calendars)
-for key in locale_keys:
-    locale_database[key].toXml(calendars)
+    writer.close()
+    return 0
 
-print "    </localeList>"
-print "</localeDatabase>"
+if __name__ == '__main__':
+    sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/cldr2qtimezone.py b/util/locale_database/cldr2qtimezone.py
index 4c3609056d..70b5d1e69e 100755
--- a/util/locale_database/cldr2qtimezone.py
+++ b/util/locale_database/cldr2qtimezone.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 #############################################################################
 ##
-## Copyright (C) 2019 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@@ -34,59 +34,20 @@ the CLDR data.  Pass its common/ directory as first parameter to this
 script and the qtbase root directory as second parameter.  It shall
 update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for
 use.
-
-The XML structure is as follows:
-
- <supplementalData>
-     <version number="$Revision:...$"/>
-     <generation date="$Date:...$"/>
-     <windowsZones>
-         <mapTimezones otherVersion="..." typeVersion="...">
-             <!-- (UTC-08:00) Pacific Time (US & Canada) -->
-             <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
-             <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
-             <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
-             <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
-         </mapTimezones>
-     </windowsZones>
- </supplementalData>
 """
 
 import os
-import sys
-import datetime
-import tempfile
-import enumdata
-import xpathlite
-from  xpathlite import DraftResolution
 import re
-import qlocalexml2cpp
+import datetime
+import textwrap
 
-findAlias = xpathlite.findAlias
-findEntry = xpathlite.findEntry
-findEntryInFile = xpathlite._findEntryInFile
-findTagsInFile = xpathlite.findTagsInFile
-unicode2hex = qlocalexml2cpp.unicode2hex
-wrap_list = qlocalexml2cpp.wrap_list
+from localetools import unicode2hex, wrap_list, Error, SourceFileEditor
+from cldr import CldrAccess
 
-class ByteArrayData:
-    def __init__(self):
-        self.data = []
-        self.hash = {}
-    def append(self, s):
-        s = s + '\0'
-        if s in self.hash:
-            return self.hash[s]
+### Data that may need updates in response to new entries in the CLDR file ###
 
-        lst = unicode2hex(s)
-        index = len(self.data)
-        if index > 65535:
-            print "\n\n\n#error Data index is too big!"
-            sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
-            sys.exit(1)
-        self.hash[s] = index
-        self.data += lst
-        return index
+# This script shall report the update you need, if this arises.
+# However, you may need to research the relevant zone's standard offset.
 
 # List of currently known Windows IDs.
 # If this script reports missing IDs, please add them here.
@@ -233,12 +194,6 @@ windowsIdList = (
     (u'Yakutsk Standard Time',            32400),
 )
 
-def windowsIdToKey(windowsId):
-    for index, pair in enumerate(windowsIdList):
-        if pair[0] == windowsId:
-            return index + 1
-    return 0
-
 # List of standard UTC IDs to use.  Not public so may be safely changed.
 # Do not remove IDs, as each entry is part of the API/behavior guarantee.
 # ( UTC Id, Offset Seconds )
@@ -285,94 +240,43 @@ utcIdList = (
     (u'UTC+14:00',  50400),
 )
 
-def usage():
-    print "Usage: cldr2qtimezone.py <path to cldr core/common> <path to qtbase>"
-    sys.exit()
-
-if len(sys.argv) != 3:
-    usage()
-
-cldrPath = sys.argv[1]
-qtPath = sys.argv[2]
-
-if not os.path.isdir(cldrPath) or not os.path.isdir(qtPath):
-    usage()
-
-windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml"
-tempFileDir = qtPath
-dataFilePath = qtPath + "/src/corelib/time/qtimezoneprivate_data_p.h"
-
-if not (os.path.isfile(windowsZonesPath) and os.path.isfile(dataFilePath)):
-    usage()
-
-cldr_version = 'unknown'
-ldml = open(cldrPath + "/dtd/ldml.dtd", "r")
-for line in ldml:
-    if 'version cldrVersion CDATA #FIXED' in line:
-        cldr_version = line.split('"')[1]
-
-# [[u'version', [(u'number', u'$Revision: 7825 $')]]]
-versionNumber = findTagsInFile(windowsZonesPath, "version")[0][1][0][1]
-
-mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones")
-
-defaultDict = {}
-windowsIdDict = {}
-
-if mapTimezones:
-    badZones = set()
-    for mapZone in mapTimezones:
-        # [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]]
-        if mapZone[0] == u'mapZone':
-            data = {}
-            for attribute in mapZone[1]:
-                if attribute[0] == u'other':
-                    data['windowsId'] = attribute[1]
-                if attribute[0] == u'territory':
-                    data['countryCode'] = attribute[1]
-                if attribute[0] == u'type':
-                    data['ianaList'] = attribute[1]
-
-            data['windowsKey'] = windowsIdToKey(data['windowsId'])
-            if data['windowsKey'] <= 0:
-                badZones.add(data['windowsId'])
-
-            countryId = 0
-            if data['countryCode'] == u'001':
-                defaultDict[data['windowsKey']] = data['ianaList']
-            else:
-                data['countryId'] = enumdata.countryCodeToId(data['countryCode'])
-                if data['countryId'] < 0:
-                    raise xpathlite.Error("Unknown Country Code \"%s\"" % data['countryCode'])
-                data['country'] = enumdata.country_list[data['countryId']][0]
-                windowsIdDict[data['windowsKey'], data['countryId']] = data
-    if badZones:
-        sys.stderr.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones))
-                         + "\nto the windowIdList in cldr2qtimezone.py\n\n")
-        raise xpathlite.Error("Unknown Windows IDs")
-
-print "Input file parsed, now writing data"
-
-GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n"
-GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n"
-
-# Create a temp file to write the new data into
-(newTempFile, newTempFilePath) = tempfile.mkstemp("qtimezone_data_p", dir=tempFileDir)
-newTempFile = os.fdopen(newTempFile, "w")
-
-# Open the old file and copy over the first non-generated section to the new file
-oldDataFile = open(dataFilePath, "r")
-s = oldDataFile.readline()
-while s and s != GENERATED_BLOCK_START:
-    newTempFile.write(s)
-    s = oldDataFile.readline()
-
-# Write out generated block start tag and warning
-newTempFile.write(GENERATED_BLOCK_START)
-newTempFile.write("""
+### End of data that may need updates in response to CLDR ###
+
+class ByteArrayData:
+    def __init__(self):
+        self.data = []
+        self.hash = {}
+
+    def append(self, s):
+        s = s + '\0'
+        if s in self.hash:
+            return self.hash[s]
+
+        lst = unicode2hex(s)
+        index = len(self.data)
+        if index > 0xffff:
+            raise Error('Index ({}) outside the uint16 range !'.format(index))
+        self.hash[s] = index
+        self.data += lst
+        return index
+
+    def write(self, out, name):
+        out('\nstatic const char {}[] = {{\n'.format(name))
+        out(wrap_list(self.data))
+        out('\n};\n')
+
+class ZoneIdWriter (SourceFileEditor):
+    def write(self, version, defaults, windowsIds):
+        self.__writeWarning(version)
+        windows, iana = self.__writeTables(self.writer.write, defaults, windowsIds)
+        windows.write(self.writer.write, 'windowsIdData')
+        iana.write(self.writer.write, 'ianaIdData')
+
+    def __writeWarning(self, version):
+        self.writer.write("""
 /*
-    This part of the file was generated on %s from the
-    Common Locale Data Repository v%s supplemental/windowsZones.xml file %s
+    This part of the file was generated on {} from the
+    Common Locale Data Repository v{} file supplemental/windowsZones.xml
 
     http://www.unicode.org/cldr/
 
@@ -380,80 +284,111 @@ newTempFile.write("""
     edited) CLDR data; see qtbase/util/locale_database/.
 */
 
-""" % (str(datetime.date.today()), cldr_version, versionNumber) )
-
-windowsIdData = ByteArrayData()
-ianaIdData = ByteArrayData()
-
-# Write Windows/IANA table
-newTempFile.write("// Windows ID Key, Country Enum, IANA ID Index\n")
-newTempFile.write("static const QZoneData zoneDataTable[] = {\n")
-for index in sorted(windowsIdDict):
-    data = windowsIdDict[index]
-    newTempFile.write("    { %6d,%6d,%6d }, // %s / %s\n"
-                         % (data['windowsKey'],
-                            data['countryId'],
-                            ianaIdData.append(data['ianaList']),
-                            data['windowsId'],
-                            data['country']))
-newTempFile.write("    {      0,     0,     0 } // Trailing zeroes\n")
-newTempFile.write("};\n\n")
-
-print "Done Zone Data"
-
-# Write Windows ID key table
-newTempFile.write("// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n")
-newTempFile.write("static const QWindowsData windowsDataTable[] = {\n")
-for index, pair in enumerate(windowsIdList):
-    newTempFile.write("    { %6d,%6d,%6d,%6d }, // %s\n"
-                      % (index + 1, windowsIdData.append(pair[0]),
-                         ianaIdData.append(defaultDict[index + 1]), pair[1], pair[0]))
-newTempFile.write("    {      0,     0,     0,     0 } // Trailing zeroes\n")
-newTempFile.write("};\n\n")
-
-print "Done Windows Data Table"
-
-# Write UTC ID key table
-newTempFile.write("// IANA ID Index, UTC Offset\n")
-newTempFile.write("static const QUtcData utcDataTable[] = {\n")
-for pair in utcIdList:
-    newTempFile.write("    { %6d,%6d }, // %s\n"
-                      % (ianaIdData.append(pair[0]), pair[1], pair[0]))
-newTempFile.write("    {     0,      0 } // Trailing zeroes\n")
-newTempFile.write("};\n\n")
-
-print "Done UTC Data Table"
-
-# Write out Windows ID's data
-newTempFile.write("static const char windowsIdData[] = {\n")
-newTempFile.write(wrap_list(windowsIdData.data))
-newTempFile.write("\n};\n\n")
-
-# Write out IANA ID's data
-newTempFile.write("static const char ianaIdData[] = {\n")
-newTempFile.write(wrap_list(ianaIdData.data))
-newTempFile.write("\n};\n")
-
-print "Done ID Data Table"
-
-# Write out the end of generated block tag
-newTempFile.write(GENERATED_BLOCK_END)
-s = oldDataFile.readline()
-
-# Skip through the old generated data in the old file
-while s and s != GENERATED_BLOCK_END:
-    s = oldDataFile.readline()
-
-# Now copy the rest of the original file into the new file
-s = oldDataFile.readline()
-while s:
-    newTempFile.write(s)
-    s = oldDataFile.readline()
-
-# Now close the old and new file, delete the old file and copy the new file in its place
-newTempFile.close()
-oldDataFile.close()
-os.remove(dataFilePath)
-os.rename(newTempFilePath, dataFilePath)
-
-print "Data generation completed, please check the new file at " + dataFilePath
+""".format(str(datetime.date.today()), version))
+
+    @staticmethod
+    def __writeTables(out, defaults, windowsIds):
+        windowsIdData, ianaIdData = ByteArrayData(), ByteArrayData()
+
+        # Write Windows/IANA table
+        out('// Windows ID Key, Country Enum, IANA ID Index\n')
+        out('static const QZoneData zoneDataTable[] = {\n')
+        for index, data in sorted(windowsIds.items()):
+            out('    {{ {:6d},{:6d},{:6d} }}, // {} / {}\n'.format(
+                    data['windowsKey'], data['countryId'],
+                    ianaIdData.append(data['ianaList']),
+                    data['windowsId'], data['country']))
+        out('    {      0,     0,     0 } // Trailing zeroes\n')
+        out('};\n\n')
+
+        # Write Windows ID key table
+        out('// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n')
+        out('static const QWindowsData windowsDataTable[] = {\n')
+        for index, pair in enumerate(windowsIdList, 1):
+            out('    {{ {:6d},{:6d},{:6d},{:6d} }}, // {}\n'.format(
+                    index,
+                    windowsIdData.append(pair[0]),
+                    ianaIdData.append(defaults[index]),
+                    pair[1], pair[0]))
+        out('    {      0,     0,     0,     0 } // Trailing zeroes\n')
+        out('};\n\n')
+
+        # Write UTC ID key table
+        out('// IANA ID Index, UTC Offset\n')
+        out('static const QUtcData utcDataTable[] = {\n')
+        for pair in utcIdList:
+            out('    {{ {:6d},{:6d} }}, // {}\n'.format(
+                    ianaIdData.append(pair[0]), pair[1], pair[0]))
+        out('    {     0,      0 } // Trailing zeroes\n')
+        out('};\n')
+
+        return windowsIdData, ianaIdData
+
+def usage(err, name, message=''):
+    err.write("""Usage: {} path/to/cldr/core/common path/to/qtbase
+""".format(name)) # TODO: more interesting message
+    if message:
+        err.write('\n' + message + '\n')
+
+def main(args, out, err):
+    """Parses CLDR's data and updates Qt's representation of it.
+
+    Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
+    arguments. Expects two command-line options: the root of the
+    unpacked CLDR data-file tree and the root of the qtbase module's
+    checkout. Updates QTimeZone's private data about Windows time-zone
+    IDs."""
+    name = args.pop(0)
+    if len(args) != 2:
+        usage(err, name, "Expected two arguments")
+        return 1
+
+    cldrPath = args.pop(0)
+    qtPath = args.pop(0)
+
+    if not os.path.isdir(qtPath):
+        usage(err, name, "No such Qt directory: " + qtPath)
+        return 1
+    if not os.path.isdir(cldrPath):
+        usage(err, name, "No such CLDR directory: " + cldrPath)
+        return 1
+
+    dataFilePath = os.path.join(qtPath, 'src', 'corelib', 'time', 'qtimezoneprivate_data_p.h')
+    if not os.path.isfile(dataFilePath):
+        usage(err, name, 'No such file: ' + dataFilePath)
+        return 1
+
+    try:
+        version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones(
+            dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1)))
+    except IOError as e:
+        usage(err, name,
+              'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1]))
+        return 1
+    except Error as e:
+        err.write('\n'.join(textwrap.wrap(
+                    'Failed to read windowsZones.xml: ' + (e.message or e.args[1]),
+                    subsequent_indent=' ', width=80)) + '\n')
+        return 1
+
+    out.write('Input file parsed, now writing data\n')
+    try:
+        writer = ZoneIdWriter(dataFilePath, qtPath)
+    except IOError as e:
+        err.write('Failed to open files to transcribe: {}'.format(e.message or e.args[1]))
+        return 1
+
+    try:
+        writer.write(version, defaults, winIds)
+    except Error as e:
+        writer.cleanup()
+        err.write('\nError in Windows ID data: ' + e.message + '\n')
+        return 1
+
+    writer.close()
+    out.write('Data generation completed, please check the new file at ' + dataFilePath + '\n')
+    return 0
+
+if __name__ == '__main__':
+    import sys
+    sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
new file mode 100644
index 0000000000..e3e3a2e4ba
--- /dev/null
+++ b/util/locale_database/ldml.py
@@ -0,0 +1,589 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Parsing the Locale Data Markup Language
+
+It's an XML format, so the raw parsing of XML is, of course, delegated
+to xml.dom.minidom; but it has its own specific schemata and some
+funky rules for combining data from various files (inheritance between
+locales). The use of it we're interested in is extraction of CLDR's
+data, so some of the material here is specific to CLDR; see cldr.py
+for how it is mainly used.
+
+Provides various classes to wrap xml.dom's objects, specifically those
+returned by minidom.parse() and their child-nodes:
+  Node -- wraps any node in the DOM tree
+  XmlScanner -- wraps the root element of a stand-alone XML file
+  Supplement -- specializes XmlScanner for supplemental data files
+  LocaleScanner -- wraps a locale's inheritance-chain of file roots
+
+See individual classes for further detail.
+"""
+from localetools import Error
+from dateconverter import convert_date
+
+class Node (object):
+    """Wrapper for an arbitrary DOM node.
+
+    Provides various ways to select children of a node. Selected child
+    nodes are returned wrapped as Node objects.  A Node exposes the
+    raw DOM node it wraps via its .dom attribute."""
+
+    def __init__(self, elt, dullAttrs = None, draft = 0):
+        """Wraps a DOM node for ease of access.
+
+        First argument, elt, is the DOM node to wrap.
+
+        Optional second argument, dullAttrs, should either be None or
+        map each LDML tag name to a list of the names of
+        non-distinguishing attributes for nodes with the given tag
+        name. If None is given, no distinguishing attribute checks are
+        performed.
+
+        (Optional third argument, draft, should only be supplied by
+        this class's creation of child nodes; it is the maximum draft
+        score of any ancestor of the new node.)"""
+        self.dom, self.__dull = elt, dullAttrs
+        try:
+            attr = elt.attributes['draft'].nodeValue
+        except KeyError:
+            self.draft = draft
+        else:
+            self.draft = max(draft, self.draftScore(attr))
+
+    def findAllChildren(self, tag, wanted = None, allDull = False):
+        """All children that do have the given tag and attributes.
+
+        First argument is the tag: children with any other tag are
+        ignored.
+
+        Optional second argument, wanted, should either be None or map
+        attribute names to the values they must have. Only child nodes
+        with these attributes set to the given values are yielded.
+
+        By default, nodes that have distinguishing attributes, other
+        than those specified in wanted, are ignored.  Pass the allDull
+        parameter a true value to suppress this check."""
+
+        if self.__dull is None:
+            allDull = True
+        dull = () if allDull else self.__dull[tag]
+
+        for child in self.dom.childNodes:
+            if child.nodeType != child.ELEMENT_NODE:
+                continue
+            if child.nodeName != tag:
+                continue
+
+            if wanted:
+                try:
+                    if any(child.attributes[k].nodeValue != v
+                           for k, v in wanted.items()):
+                        continue
+                except KeyError: # Some wanted attribute is missing
+                    continue
+
+                if not (allDull or all(k in dull or k in wanted
+                                       for k in child.attributes.keys())):
+                    continue
+
+            elif not (allDull or all(k in dull
+                                     for k in child.attributes.keys())):
+                continue
+
+            yield Node(child, self.__dull, self.draft)
+
+    def findUniqueChild(self, tag):
+        """Returns the single child with the given nodeName.
+
+        Raises Error if there is no such child or there is more than
+        one."""
+        seq = self.findAllChildren(tag)
+        try:
+            node = seq.next()
+        except StopIteration:
+            raise Error('No child found where one was expected', tag)
+        for it in seq:
+            raise Error('Many children found where only one was expected', tag)
+        return node
+
+    @classmethod
+    def draftScore(cls, level):
+        """Maps draft level names to numeric scores.
+
+        Single parameter, level, is the least sure value of the draft
+        attribute on a node that you're willing to accept; returns a
+        numeric value (lower is less drafty).
+
+        Tempting as it is to insist on low draft scores, there are
+        many locales in which pretty much every leaf is
+        unconfirmed. It may make sense to actually check each
+        XmlScanner object, or each node in each LocaleScanner's nodes
+        list, to see what its distribution of draft level looks like,
+        so as to set the acceptable draft score for its elements
+        accordingly. However, for the moment, we mostly just accept
+        all elements, regardless of draft values (the one exception is
+        am/pm indicators)."""
+        return cls.__draftScores.get(level, 5) if level else 0
+
+    # Implementation details:
+    __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2,
+                         contributed = 1, approved = 0, false = 0)
+
+def _parseXPath(selector):
+    # Split "tag[attr=val][...]" into tag-name and attribute mapping
+    attrs = selector.split('[')
+    name = attrs.pop(0)
+    if attrs:
+        attrs = [x.strip() for x in attrs]
+        assert all(x.endswith(']') for x in attrs)
+        attrs = [x[:-1].split('=') for x in attrs]
+        assert all(len(x) in (1, 2) for x in attrs)
+        attrs = (('type', x[0]) if len(x) == 1 else x for x in attrs)
+    return name, dict(attrs)
+
+def _iterateEach(iters):
+    # Flatten a two-layer iterator.
+    for it in iters:
+        for item in it:
+            yield item
+
+class XmlScanner (object):
+    """Wrap an XML file to enable XPath access to its nodes.
+    """
+    def __init__(self, node):
+        self.root = node
+
+    def findNodes(self, xpath):
+        """Return all nodes under self.root matching this xpath.
+
+        Ignores any excess attributes."""
+        elts = (self.root,)
+        for selector in xpath.split('/'):
+            tag, attrs = _parseXPath(selector)
+            elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts))
+            if not elts:
+                break
+        return elts
+
+class Supplement (XmlScanner):
+    def find(self, xpath):
+        elts = self.findNodes(xpath)
+        for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
+                                for e in elts):
+            if elt.attributes:
+                yield (elt.nodeName,
+                       dict((k, v if isinstance(v, basestring) else v.nodeValue)
+                            for k, v in elt.attributes.items()))
+
+class LocaleScanner (object):
+    def __init__(self, name, nodes, root):
+        self.name, self.nodes, self.base = name, nodes, root
+
+    def find(self, xpath, default = None, draft = None):
+        """XPath search for the content of an element.
+
+        Required argument, xpath, is the XPath to search for. Optional
+        second argument is a default value to use, if no such node is
+        found.  Optional third argument is a draft score (see
+        Node.draftScore() for details); if given, leaf elements with
+        higher draft scores are ignored."""
+        try:
+            for elt in self.__find(xpath):
+                try:
+                    if draft is None or elt.draft <= draft:
+                        return elt.dom.firstChild.nodeValue
+                except (AttributeError, KeyError):
+                    pass
+        except Error as e:
+            if default is None:
+                raise
+            return default
+
+    def tagCodes(self):
+        """Yields four tag codes
+
+        The tag codes are language, script, country and variant; an
+        empty value for any of them indicates that no value was
+        provided.  The values are obtained from the primary file's
+        top-level <identity> element.  An Error is raised if any
+        top-level <alias> element of this file has a non-empty source
+        attribute; that attribute value is mentioned in the error's
+        message."""
+        root = self.nodes[0]
+        for alias in root.findAllChildren('alias', allDull=True):
+            try:
+                source = alias.dom.attributes['source'].nodeValue
+            except (KeyError, AttributeError):
+                pass
+            else:
+                raise Error('Alias to {}'.format(source))
+
+        ids = root.findUniqueChild('identity')
+        for code in ('language', 'script', 'territory', 'variant'):
+            for node in ids.findAllChildren(code, allDull=True):
+                try:
+                    yield node.dom.attributes['type'].nodeValue
+                except (KeyError, AttributeError):
+                    pass
+                else:
+                    break # only want one value for each code
+            else: # No value for this code, use empty
+                yield ''
+
+    def currencyData(self, isoCode):
+        """Fetches currency data for this locale.
+
+        Single argument, isoCode, is the ISO currency code for the
+        currency in use in the country. See also numericData, which
+        includes some currency formats.
+        """
+        if isoCode:
+            stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
+            symbol = self.find(stem + 'symbol', '')
+            displays = tuple(self.find(stem + 'displayName' + tail, '')
+                for tail in ('',) + tuple(
+                    '[count={}]'.format(x) for x in ('zero', 'one', 'two',
+                                                     'few', 'many', 'other')))
+            while displays and not displays[-1]:
+                displays = displays[:-1]
+            name = ';'.join(displays)
+        else:
+            symbol = name = ''
+        yield 'currencySymbol', symbol
+        yield 'currencyDisplayName', name
+
+    def numericData(self, lookup, complain = lambda text: None):
+        """Generate assorted numeric data for the locale.
+
+        First argument, lookup, is a callable that maps a numbering
+        system's name to certain data about the system, as a mapping;
+        we expect this to have u'digits' as a key.
+        """
+        system = self.find('numbers/defaultNumberingSystem')
+        stem = 'numbers/symbols[numberSystem={}]/'.format(system)
+        decimal = self.find(stem + 'decimal')
+        group = self.find(stem + 'group')
+        assert decimal != group, (self.name, system, decimal)
+        yield 'decimal', decimal
+        yield 'group', group
+        yield 'percent', self.find(stem + 'percentSign')
+        yield 'list', self.find(stem + 'list')
+        yield 'exp', self.find(stem + 'exponential')
+
+        digits = lookup(system)['digits']
+        assert len(digits) == 10
+        zero = digits[0]
+        # Qt's number-formatting code assumes digits are consecutive:
+        assert all(ord(c) == i for i, c in enumerate(digits, ord(zero)))
+        yield 'zero', zero
+
+        plus = self.find(stem + 'plusSign')
+        minus = self.find(stem + 'minusSign')
+        yield 'plus', plus
+        yield 'minus', minus
+
+        # Currency formatting:
+        xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[accounting]/pattern'
+        try:
+            money = self.find(xpath.replace('Formats/',
+                                            'Formats[numberSystem={}]/'.format(system)))
+        except Error:
+            money = self.find(xpath)
+        money = self.__currencyFormats(money, plus, minus)
+        yield 'currencyFormat', money.next()
+        neg = ''
+        for it in money:
+            assert not neg, 'There should be at most one more pattern'
+            neg = it
+        yield 'currencyNegativeFormat', neg
+
+    def textPatternData(self):
+        for key in ('quotationStart', 'alternateQuotationEnd',
+                    'quotationEnd', 'alternateQuotationStart'):
+            yield key, self.find('delimiters/' + key)
+
+        for key in ('start', 'middle', 'end'):
+            yield ('listPatternPart' + key.capitalize(),
+                   self.__fromLdmlListPattern(self.find(
+                        'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+        yield ('listPatternPartTwo',
+               self.__fromLdmlListPattern(self.find(
+                    'listPatterns/listPattern/listPatternPart[2]')))
+
+        stem = 'dates/calendars/calendar[gregorian]/'
+        # TODO: is wide really the right width to use here ?
+        # abbreviated might be an option ... or try both ?
+        meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+        for key in ('am', 'pm'):
+            yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+                                 draft = Node.draftScore('contributed'))
+
+        for pair in (('long', 'full'), ('short', 'short')):
+            for key in ('time', 'date'):
+                yield (pair[0] + key.capitalize() + 'Format',
+                       convert_date(self.find(
+                            stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
+                                key, key, pair[1], key))))
+
+    def endonyms(self, language, script, country, variant):
+        # TODO: take variant into account ?
+        for seq in ((language, script, country),
+                    (language, script), (language, country), (language,)):
+            if not all(seq):
+                continue
+            try:
+                yield ('languageEndonym',
+                       self.find('localeDisplayNames/languages/language[{}]'
+                                 .format('_'.join(seq))))
+            except Error:
+                pass
+            else:
+                break
+        else:
+            # grumble(failed to find endonym for language)
+            yield 'languageEndonym', ''
+
+        yield ('countryEndonym',
+               self.find('localeDisplayNames/territories/territory[{}]'
+                         .format(country), ''))
+
+    def unitData(self):
+        yield ('byte_unit',
+               self.find('units/unitLength[long]/unit[digital-byte]/displayName',
+                         'bytes'))
+
+        unit = self.__findUnit('', 'B')
+        cache = [] # Populated by the SI call, to give hints to the IEC call
+        yield ('byte_si_quantified',
+               ';'.join(self.__unitCount('', unit, cache)))
+        # IEC 60027-2
+        # http://physics.nist.gov/cuu/Units/binary.html
+        yield ('byte_iec_quantified',
+               ';'.join(self.__unitCount('bi', 'iB', cache)))
+
+    def calendarNames(self, calendars):
+        namings = self.__nameForms
+        for cal in calendars:
+            stem = 'dates/calendars/calendar[' + cal + ']/months/'
+            for key, mode, size in namings:
+                prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
+                yield (key + 'Months_' + cal,
+                       ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+                                for i in range(1, 13)))
+
+        # Day data (for Gregorian, at least):
+        stem = 'dates/calendars/calendar[gregorian]/days/'
+        days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
+        for (key, mode, size) in namings:
+            prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
+            yield (key + 'Days',
+                   ';'.join(self.find(stem + prop + '[' + day + ']')
+                            for day in days))
+
+    # Implementation details
+    __nameForms = (
+        ('standaloneLong', 'stand-alone', 'wide'),
+        ('standaloneShort', 'stand-alone', 'abbreviated'),
+        ('standaloneNarrow', 'stand-alone', 'narrow'),
+        ('long', 'format', 'wide'),
+        ('short', 'format', 'abbreviated'),
+        ('narrow', 'format', 'narrow'),
+        ) # Used for month and day names
+
+    def __find(self, xpath):
+        retries = [ xpath.split('/') ]
+        while retries:
+            tags, elts, roots = retries.pop(), self.nodes, (self.base.root,)
+            for selector in tags:
+                tag, attrs = _parseXPath(selector)
+                elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts))
+                if not elts:
+                    break
+
+            else: # Found matching elements
+                # Possibly filter elts to prefer the least drafty ?
+                for elt in elts:
+                    yield elt
+
+            # Process roots separately: otherwise the alias-processing
+            # is excessive.
+            for i, selector in enumerate(tags):
+                tag, attrs = _parseXPath(selector)
+
+                for alias in tuple(_iterateEach(r.findAllChildren('alias', allDull=True)
+                                                for r in roots)):
+                    if alias.dom.attributes['source'].nodeValue == 'locale':
+                        replace = alias.dom.attributes['path'].nodeValue.split('/')
+                        retries.append(self.__xpathJoin(tags[:i], replace, tags[i:]))
+
+                roots = tuple(_iterateEach(r.findAllChildren(tag, attrs) for r in roots))
+                if not roots:
+                    if retries: # Let outer loop fall back on an alias path:
+                        break
+                    sought = '/'.join(tags)
+                    if sought != xpath:
+                        sought += ' (for {})'.format(xpath)
+                    raise Error('All lack child {} for {} in {}'.format(
+                            selector, sought, self.name))
+
+            else: # Found matching elements
+                for elt in roots:
+                    yield elt
+
+        sought = '/'.join(tags)
+        if sought != xpath:
+            sought += ' (for {})'.format(xpath)
+        raise Error('No {} in {}'.format(sought, self.name))
+
+    def __findUnit(self, keySuffix, quantify, fallback=''):
+        # The displayName for a quantified unit in en.xml is kByte
+        # (even for unitLength[narrow]) instead of kB (etc.), so
+        # prefer any unitPattern provided, but prune its placeholder:
+        for size in ('short', 'narrow'): # TODO: reverse order ?
+            stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+            for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+                try:
+                    ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+                except Error:
+                    continue
+
+                # TODO: do count-handling, instead of discarding placeholders
+                if False: # TODO: do it this way, instead !
+                    ans = ans.replace('{0}', '').strip()
+                elif ans.startswith('{0}'):
+                    ans = ans[3:].lstrip()
+                if ans:
+                    return ans
+
+            try:
+                return self.find(stem + 'displayName')
+            except Error:
+                pass
+
+        return fallback
+
+    def __unitCount(self, keySuffix, suffix, cache,
+                    # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+                    # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+                    siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+        """Work out the unit quantifiers.
+
+        Unfortunately, the CLDR data only go up to terabytes and we
+        want all the way to exabytes; but we can recognize the SI
+        quantifiers as prefixes, strip and identify the tail as the
+        localized translation for 'B' (e.g. French has 'octet' for
+        'byte' and uses ko, Mo, Go, To from which we can extrapolate
+        Po, Eo).
+
+        Should be called first for the SI quantifiers, with suffix =
+        'B', then for the IEC ones, with suffix = 'iB'; the list cache
+        (initially empty before first call) is used to let the second
+        call know what the first learned about the localized unit.
+        """
+        if suffix == 'iB': # second call, re-using first's cache
+            if cache:
+                byte = cache.pop()
+                if all(byte == k for k in cache):
+                    suffix = 'i' + byte
+            for q in siQuantifiers:
+                # Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
+                yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+        else: # first call
+            tail = suffix = suffix or 'B'
+            for q in siQuantifiers:
+                it = self.__findUnit(keySuffix, q)
+                # kB for kilobyte, in contrast with KiB for IEC:
+                q = q[0] if q == 'kilo' else q[0].upper()
+                if not it:
+                    it = q + tail
+                elif it.startswith(q):
+                    rest = it[1:]
+                    tail = rest if all(rest == k for k in cache) else suffix
+                    cache.append(rest)
+                yield it
+
+    @staticmethod
+    def __currencyFormats(patterns, plus, minus):
+        """Yield each ';'-separated part of patterns, rewritten with %1 for its
+        number, %2 for the currency sign and plus, minus for '+', '-'."""
+        for p in patterns.split(';'):
+            p = p.replace('0', '#').replace(',', '').replace('.', '')
+            # Keep only the first '#', if any; str.find() returns -1 rather than
+            # raising when there is none, making cut 0 and leaving p unchanged:
+            cut = p.find('#') + 1
+            p = p[:cut] + p[cut:].replace('#', '')
+            p = p.replace('#', "%1")
+            # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
+            # there can be doubled or tripled currency sign, however none of the
+            # locales use that.
+            p = p.replace(u'\xa4', "%2")
+            # Single quote goes away, but double goes to single:
+            p = p.replace("''", '###').replace("'", '').replace('###', "'")
+            # Use number system's signs:
+            p = p.replace('+', plus).replace('-', minus)
+            yield p
+
+    @staticmethod
+    def __fromLdmlListPattern(pattern):
+        # This is a very limited parsing of the format for list pattern part only.
+        return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3')
+
+    @staticmethod
+    def __fromLdmlPath(seq): # tool function for __xpathJoin()
+        """Convert LDML's [@name='value'] to our [name=value] form."""
+        for it in seq:
+            # First dismember it:
+            attrs = it.split('[')
+            tag = attrs.pop(0)
+            if not attrs: # Short-cut the easy case:
+                yield it
+                continue
+
+            assert all(x.endswith(']') for x in attrs)
+            attrs = [x[:-1].split('=') for x in attrs]
+            # Then fix each attribute specification in it:
+            attrs = [(x[0][1:] if x[0].startswith('@') else x[0],
+                      x[1][1:-1] if x[1].startswith("'") and x[1].endswith("'") else x[1])
+                     for x in attrs]
+            # Finally, put it all back together:
+            attrs = ['='.join(x) + ']' for x in attrs]
+            attrs.insert(0, tag)
+            yield '['.join(attrs)
+
+    @classmethod
+    def __xpathJoin(cls, head, insert, tail):
+        """Join three lists of XPath selectors.
+
+        Each of head, insert and tail is a sequence of selectors but
+        insert may start with some uses of '..', that we want to
+        resolve away, and may use LDML's attribute format, that we
+        want to convert to our format."""
+        while insert and insert[0] == '..':
+            insert.pop(0)
+            head.pop()
+        return head + list(cls.__fromLdmlPath(insert)) + tail
diff --git a/util/locale_database/localetools.py b/util/locale_database/localetools.py
new file mode 100644
index 0000000000..29153366b3
--- /dev/null
+++ b/util/locale_database/localetools.py
@@ -0,0 +1,164 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Utilities shared among the CLDR extraction tools.
+
+Functions:
+  unicode2hex() -- converts unicode text to UTF-16 code units in hex form.
+  wrap_list() -- map list to comma-separated string, 20 entries per line.
+
+Classes:
+  Error -- A shared error class.
+  Transcriber -- edit a file by writing a temporary file, then renaming.
+  SourceFileEditor -- adds standard prelude and tail handling to Transcriber.
+"""
+
+import os
+import tempfile
+
+class Error (StandardError):
+    __upinit = StandardError.__init__
+    def __init__(self, msg, *args):
+        self.__upinit(msg, *args)
+        self.message = msg
+    def __str__(self):
+        return self.message
+
+def unicode2hex(s):
+    """Return the UTF-16 code units of text s as a list of hex() strings."""
+    units = []
+    for char in s:
+        code = ord(char)
+        if code <= 0xFFFF:
+            # Fits in a single 16-bit unit: emit the code-point itself.
+            units.append(hex(code))
+            continue
+        # Too big for one unit: make a surrogate pair, high surrogate
+        # first, using arithmetic copied from qchar.h
+        units.append(hex((code >> 10) + 0xd7c0))
+        units.append(hex(code % 0x400 + 0xdc00))
+    return units
+
+def wrap_list(lst):
+    """Map a list of strings to comma-separated text, 20 entries per line."""
+    step = 20
+    rows = (lst[at : at + step] for at in range(0, len(lst), step))
+    # Entries within a row are joined by ", "; rows are joined by ",\n":
+    return ",\n".join(", ".join(row) for row in rows)
+
+class Transcriber (object):
+    """Helper class to facilitate rewriting source files.
+
+    This class takes care of the temporary file manipulation. Derived
+    classes need to implement transcribing of the content, with
+    whatever modifications they may want.  Members reader and writer
+    are exposed; use writer.write() to output to the new file; use
+    reader.readline() or iterate reader to read the original.
+
+    Callers should call close() on success or cleanup() on failure (to
+    clear away the temporary file); cleanup() after close() is harmless.
+    """
+    def __init__(self, path, temp):
+        # Open the old file and, in directory temp, a file for the new data:
+        self.reader = open(path)
+        temp, tempPath = tempfile.mkstemp(os.path.split(path)[1], dir = temp)
+        self.__names = path, tempPath
+        self.writer = os.fdopen(temp, "w")
+
+    def close(self):
+        self.reader.close()
+        self.writer.close()
+        self.reader = self.writer = None
+        source, temp = self.__names
+        os.remove(source)
+        self.__names = () # temp is now the only copy: cleanup() must spare it
+        os.rename(temp, source)
+
+    def cleanup(self):
+        if self.__names:
+            for stream in (self.reader, self.writer):
+                if stream is not None: # a failed close() may have got here first
+                    stream.close()
+            os.remove(self.__names[1]) # Remove temp-file
+            self.__names = ()
+
+class SourceFileEditor (Transcriber):
+    """Transcriber with transcription of code around a generated block.
+
+    We have a common pattern of source files with a generated part
+    embedded in a context that's not touched by the regeneration
+    scripts. The generated part is, in each case, marked with a common
+    pair of start and end markers. We transcribe the old file to a new
+    temporary file; on success, we then remove the original and move
+    the new version to replace it.
+
+    This class takes care of transcribing the parts before and after
+    the generated content; on creation, an instance will copy the
+    preamble up to the start marker; its close() will skip over the
+    original's generated content and resume transcribing with the end
+    marker. Derived classes need only implement the generation of the
+    content in between.
+
+    Callers should call close() on success or cleanup() on failure (to
+    clear away the temporary file); see Transcriber.
+    """
+    __upinit = Transcriber.__init__
+    def __init__(self, path, temp):
+        """Set up the source file editor.
+
+        Requires two arguments: the path to the source file to be read
+        and, on success, replaced with a new version; and the
+        directory in which to store the temporary file during the
+        rewrite."""
+        self.__upinit(path, temp)
+        self.__copyPrelude()
+
+    __upclose = Transcriber.close
+    def close(self):
+        self.__copyTail()
+        self.__upclose()
+
+    # Implementation details:
+    GENERATED_BLOCK_START = '// GENERATED PART STARTS HERE'
+    GENERATED_BLOCK_END = '// GENERATED PART ENDS HERE'
+
+    def __copyPrelude(self):
+        # Copy over the first non-generated section to the new file
+        for line in self.reader:
+            self.writer.write(line)
+            if line.strip() == self.GENERATED_BLOCK_START:
+                break
+
+    def __copyTail(self):
+        # Skip through the old generated data in the old file
+        for line in self.reader:
+            if line.strip() == self.GENERATED_BLOCK_END:
+                self.writer.write(line)
+                break
+        # Transcribe the remainder:
+        for line in self.reader:
+            self.writer.write(line)
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index 0a4628e05e..550021ba01 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -28,11 +28,18 @@
 #############################################################################
 """Shared serialization-scanning code for QLocaleXML format.
 
-The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py
+Provides classes:
+  Locale -- common data-type representing one locale as a namespace
+  QLocaleXmlWriter -- helper to write a QLocaleXML file
+  QLocaleXmlReader -- helper to read a QLocaleXML file back in
+
+Support:
+  Spacer -- provides control over indentation of the output.
 """
+from __future__ import print_function
 from xml.sax.saxutils import escape
 
-import xpathlite
+from localetools import Error
 
 # Tools used by Locale:
 def camel(seq):
@@ -43,6 +50,10 @@ def camel(seq):
 def camelCase(words):
     return ''.join(camel(iter(words)))
 
+def addEscapes(s):
+    return ''.join(c if n < 128 else '\\x{:02x}'.format(n)
+                   for n, c in ((ord(c), c) for c in s))
+
 def startCount(c, text): # strspn
     """First index in text where it doesn't have a character in c"""
     assert text and text[0] in c
@@ -58,6 +69,8 @@ def convertFormat(format):
     * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
     * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
     """
+    # Compare and contrast dateconverter.py's convert_date().
+    # Need to (check consistency and) reduce redundancy !
     result = ""
     i = 0
     while i < len(format):
@@ -102,7 +115,314 @@ def convertFormat(format):
 
     return result
 
-class Locale:
+class QLocaleXmlReader (object):
+    def __init__(self, filename):
+        self.root = self.__parse(filename)
+        # Lists of (id, name, code) triples:
+        languages = tuple(self.__loadMap('language'))
+        scripts = tuple(self.__loadMap('script'))
+        countries = tuple(self.__loadMap('country'))
+        self.__likely = tuple(self.__likelySubtagsMap())
+        # Mappings {ID: (name, code)}
+        self.languages = dict((v[0], v[1:]) for v in languages)
+        self.scripts = dict((v[0], v[1:]) for v in scripts)
+        self.countries = dict((v[0], v[1:]) for v in countries)
+        # Private mappings {name: (ID, code)}
+        self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
+        self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
+        self.__landByName = dict((v[1], (v[0], v[2])) for v in countries)
+        # Other properties:
+        self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries)
+        self.cldrVersion = self.__firstChildText(self.root, "version")
+
+    def loadLocaleMap(self, calendars, grumble = lambda text: None):
+        kid = self.__firstChildText
+        likely = dict(self.__likely)
+        for elt in self.__eachEltInGroup(self.root, 'localeList', 'locale'):
+            locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars)
+            language = self.__langByName[locale.language][0]
+            script = self.__textByName[locale.script][0]
+            country = self.__landByName[locale.country][0]
+
+            if language != 1: # C
+                if country == 0:
+                    grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language))
+
+                if script == 0:
+                    # Find default script for the given language and country - see:
+                    # http://www.unicode.org/reports/tr35/#Likely_Subtags
+                    try:
+                        try:
+                            to = likely[(locale.language, 'AnyScript', locale.country)]
+                        except KeyError:
+                            to = likely[(locale.language, 'AnyScript', 'AnyCountry')]
+                    except KeyError:
+                        pass
+                    else:
+                        locale.script = to[1]
+                        script = self.__textByName[locale.script][0]
+
+            yield (language, script, country), locale
+
+    def languageIndices(self, locales):
+        index = 0
+        for key, value in self.languages.iteritems():
+            i, count = 0, locales.count(key)
+            if count > 0:
+                i = index
+                index += count
+            yield i, value[0]
+
+    def likelyMap(self):
+        def tag(t):
+            lang, script, land = t
+            yield lang[1] if lang[0] else 'und'
+            if script[0]: yield script[1]
+            if land[0]: yield land[1]
+
+        def ids(t):
+            return tuple(x[0] for x in t)
+
+        for i, pair in enumerate(self.__likely, 1):
+            have = self.__fromNames(pair[0])
+            give = self.__fromNames(pair[1])
+            yield ('_'.join(tag(have)), ids(have),
+                   '_'.join(tag(give)), ids(give),
+                   i == len(self.__likely))
+
+    def defaultMap(self):
+        """Map language and script to their default country by ID.
+
+        Yields ((language, script), country) wherever the likely
+        sub-tags mapping says language's default locale uses the given
+        script and country."""
+        for have, give in self.__likely:
+            if have[1:] == ('AnyScript', 'AnyCountry') and give[2] != 'AnyCountry':
+                assert have[0] == give[0], (have, give)
+                yield ((self.__langByName[give[0]][0],
+                        self.__textByName[give[1]][0]),
+                       self.__landByName[give[2]][0])
+
+    # Implementation details:
+    def __loadMap(self, category):
+        kid = self.__firstChildText
+        for element in self.__eachEltInGroup(self.root, category + 'List', category):
+            yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code')
+
+    def __likelySubtagsMap(self):
+        def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText):
+            return tuple(kid(element, key) for key in keys)
+
+        kid = self.__firstChildElt
+        for elt in self.__eachEltInGroup(self.root, 'likelySubtags', 'likelySubtag'):
+            yield triplet(kid(elt, "from")), triplet(kid(elt, "to"))
+
+    def __fromNames(self, names):
+        return self.__langByName[names[0]], self.__textByName[names[1]], self.__landByName[names[2]]
+
+    # DOM access:
+    from xml.dom import minidom
+    @staticmethod
+    def __parse(filename, read = minidom.parse):
+        return read(filename).documentElement
+
+    @staticmethod
+    def __isNodeNamed(elt, name, TYPE=minidom.Node.ELEMENT_NODE):
+        return elt.nodeType == TYPE and elt.nodeName == name
+    del minidom
+
+    @staticmethod
+    def __eltWords(elt):
+        child = elt.firstChild
+        while child:
+            if child.nodeType == elt.TEXT_NODE:
+                yield child.nodeValue
+            child = child.nextSibling
+
+    @classmethod
+    def __firstChildElt(cls, parent, name):
+        child = parent.firstChild
+        while child:
+            if cls.__isNodeNamed(child, name):
+                return child
+            child = child.nextSibling
+
+        raise Error('No {} child found'.format(name))
+
+    @classmethod
+    def __firstChildText(cls, elt, key):
+        return ' '.join(cls.__eltWords(cls.__firstChildElt(elt, key)))
+
+    @classmethod
+    def __eachEltInGroup(cls, parent, group, key):
+        try:
+            element = cls.__firstChildElt(parent, group).firstChild
+        except Error:
+            element = None
+
+        while element:
+            if cls.__isNodeNamed(element, key):
+                yield element
+            element = element.nextSibling
+
+
+class Spacer (object):
+    def __init__(self, indent = None, initial = ''):
+        """Prepare to manage indentation and line breaks.
+
+        Arguments are both optional.
+
+        First argument, indent, is either None (its default, for
+        'minifying'), an integer (number of spaces) or the unit of
+        text that is to be used for each indentation level (e.g. '\t'
+        to use tabs).  If indent is None, no indentation is added, nor
+        are line-breaks; otherwise, self(text), for non-empty text,
+        shall end with a newline and begin with indentation.
+
+        Second argument, initial, is the initial indentation; it is
+        ignored if indent is None.  Indentation increases after each
+        call to self(text) in which text starts with a tag and doesn't
+        include its end-tag; indentation decreases if text starts with
+        an end-tag.  The text is not parsed any more carefully than
+        just described.
+        """
+        if indent is None:
+            self.__call = lambda x: x
+        else:
+            self.__each = ' ' * indent if isinstance(indent, int) else indent
+            self.current = initial
+            self.__call = self.__wrap
+
+    def __wrap(self, line):
+        if not line:
+            return '\n'
+
+        indent = self.current
+        if line.startswith('</'):
+            indent = self.current = indent[:-len(self.__each)]
+        elif line.startswith('<') and not line.startswith('<!'):
+            cut = line.find('>')
+            tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0]
+            if '</{}>'.format(tag) not in line:
+                self.current += self.__each
+        return indent + line + '\n'
+
+    def __call__(self, line):
+        return self.__call(line)
+
+class QLocaleXmlWriter (object):
+    def __init__(self, save = None, space = None):
+        """Set up to write digested CLDR data as QLocale XML.
+
+        Arguments are both optional.
+
+        First argument, save, is None (its default) or a callable that
+        will write content to where you intend to save it. If None, it
+        is replaced with a callable that prints the given content,
+        suppressing the newline (but see the following); this is
+        equivalent to passing sys.stdout.write.
+
+        Second argument, space, is an object to call on each text
+        output to prepend indentation and append newlines, or not as
+        the case may be. The default, None, selects a fresh Spacer(4),
+        which grows indent by four spaces after each unmatched new tag
+        and shrinks back on a close-tag (its parsing is naive, but adequate
+        to how this class uses it), while adding a newline to each line.
+        """
+        self.__rawOutput = self.__printit if save is None else save
+        self.__wrap = Spacer(4) if space is None else space # stateful: don't share
+        self.__write('<localeDatabase>')
+
+    # Output of various sections, in their usual order:
+    def enumData(self, languages, scripts, countries):
+        self.__enumTable('languageList', languages)
+        self.__enumTable('scriptList', scripts)
+        self.__enumTable('countryList', countries)
+
+    def likelySubTags(self, entries):
+        self.__openTag('likelySubtags')
+        for have, give in entries:
+            self.__openTag('likelySubtag')
+            self.__likelySubTag('from', have)
+            self.__likelySubTag('to', give)
+            self.__closeTag('likelySubtag')
+        self.__closeTag('likelySubtags')
+
+    def locales(self, locales, calendars):
+        self.__openTag('localeList')
+        self.__openTag('locale')
+        Locale.C(calendars).toXml(self.inTag, calendars)
+        self.__closeTag('locale')
+        keys = locales.keys()
+        keys.sort()
+        for key in keys:
+            self.__openTag('locale')
+            locales[key].toXml(self.inTag, calendars)
+            self.__closeTag('locale')
+        self.__closeTag('localeList')
+
+    def version(self, cldrVersion):
+        self.inTag('version', cldrVersion)
+
+    def inTag(self, tag, text):
+        self.__write('<{0}>{1}</{0}>'.format(tag, text))
+
+    def close(self):
+        if self.__rawOutput != self.__complain:
+            self.__write('</localeDatabase>')
+        self.__rawOutput = self.__complain
+
+    # Implementation details
+    @staticmethod
+    def __printit(text):
+        print(text, end='')
+    @staticmethod
+    def __complain(text):
+        raise Error('Attempted to write data after closing :-(')
+
+    def __enumTable(self, tag, table):
+        self.__openTag(tag)
+        for key, value in table.iteritems():
+            self.__openTag(tag[:-4])
+            self.inTag('name', value[0])
+            self.inTag('id', key)
+            self.inTag('code', value[1])
+            self.__closeTag(tag[:-4])
+        self.__closeTag(tag)
+
+    def __likelySubTag(self, tag, likely):
+        self.__openTag(tag)
+        self.inTag('language', likely[0])
+        self.inTag('script', likely[1])
+        self.inTag('country', likely[2])
+        # self.inTag('variant', likely[3])
+        self.__closeTag(tag)
+
+    def __openTag(self, tag):
+        self.__write('<{}>'.format(tag))
+    def __closeTag(self, tag):
+        self.__write('</{}>'.format(tag))
+
+    def __write(self, line):
+        self.__rawOutput(self.__wrap(line))
+
+class Locale (object):
+    """Holder for the assorted data representing one locale.
+
+    Implemented as a namespace; its constructor and update() have the
+    same signatures as those of a dict, acting on the instance's
+    __dict__, so the results are accessed as attributes rather than
+    mapping keys."""
+    def __init__(self, data=None, **kw):
+        self.update(data, **kw)
+
+    def update(self, data=None, **kw):
+        if data: self.__dict__.update(data)
+        if kw: self.__dict__.update(kw)
+
+    def __len__(self): # Used when testing as a boolean
+        return len(self.__dict__)
+
     @staticmethod
     def propsMonthDay(scale, lengths=('long', 'short', 'narrow')):
         for L in lengths:
@@ -158,16 +478,24 @@ class Locale:
 
         return cls(data)
 
-    def toXml(self, calendars=('gregorian',), indent='        ', tab='    '):
-        print indent + '<locale>'
-        inner = indent + tab
+    def toXml(self, write, calendars=('gregorian',)):
+        """Writes its data as QLocale XML.
+
+        First argument, write, is a callable taking the name and
+        content of an XML element; it is expected to be the inTag
+        bound method of a QLocaleXmlWriter instance.
+
+        Optional second argument is a list of calendar names, in the
+        form used by CLDR; its default is ('gregorian',).
+        """
         get = lambda k: getattr(self, k)
         for key in ('language', 'script', 'country'):
-            print inner + "<%s>" % key + get(key) + "</%s>" % key
-            print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key
+            write(key, get(key))
+            write('{}code'.format(key), get('{}_code'.format(key)))
 
-        for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'):
-            print inner + "<%s>" % key + get(key) + "</%s>" % key
+        for key in ('decimal', 'group', 'zero', 'list',
+                    'percent', 'minus', 'plus', 'exp'):
+            write(key, get(key))
 
         for key in ('languageEndonym', 'countryEndonym',
                     'quotationStart', 'quotationEnd',
@@ -185,16 +513,10 @@ class Locale:
                 '_'.join((k, cal))
                 for k in self.propsMonthDay('months')
                 for cal in calendars):
-            print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key)
+            write(key, escape(get(key)).encode('utf-8'))
 
         for key in ('currencyDigits', 'currencyRounding'):
-            print inner + "<%s>%d</%s>" % (key, get(key), key)
-
-        print indent + "</locale>"
-
-    def __init__(self, data=None, **kw):
-        if data: self.__dict__.update(data)
-        if kw: self.__dict__.update(kw)
+            write(key, get(key))
 
     # Tools used by __monthNames:
     def fullName(i, name): return name
@@ -213,6 +535,9 @@ class Locale:
     @staticmethod
     def __monthNames(calendars,
                      known={ # Map calendar to (names, extractors...):
+            # TODO: do we even need these ?  CLDR's root.xml seems to
+            # have them, complete with yeartype="leap" handling for
+            # Hebrew's extra.
             'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July',
                            'August', 'September', 'October', 'November', 'December'),
                           # Extractor pairs, (plain, standalone)
@@ -240,8 +565,8 @@ class Locale:
         for cal in calendars:
             try:
                 data = known[cal]
-            except KeyError: # Need to add an entry to known, above.
-                print 'Unsupported calendar:', cal
+            except KeyError as e: # Need to add an entry to known, above.
+                e.args += ('Unsupported calendar:', cal)
                 raise
             names, get = data[0], data[1:]
             for n, size in enumerate(sizes):
@@ -253,12 +578,11 @@ class Locale:
 
     @classmethod
     def C(cls, calendars=('gregorian',),
-          # Empty entry at end to ensure final separator when join()ed:
           days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
                   'Thursday', 'Friday', 'Saturday'),
           quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
         """Returns an object representing the C locale."""
-        return cls(dict(cls.__monthNames(calendars)),
+        return cls(cls.__monthNames(calendars),
                    language='C', language_code='0', languageEndonym='',
                    script='AnyScript', script_code='0',
                    country='AnyCountry', country_code='0', countryEndonym='',
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index 3dde298f47..db45ab2778 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -34,238 +34,53 @@ the root of the qtbase check-out as second parameter.
 """
 
 import os
-import sys
-import tempfile
 import datetime
-import xml.dom.minidom
-from enumdata import language_aliases, country_aliases, script_aliases
 
-from qlocalexml import Locale
+from qlocalexml import QLocaleXmlReader
+from xml.dom import minidom
+from localetools import unicode2hex, wrap_list, Error, Transcriber, SourceFileEditor
 
-# TODO: Make calendars a command-line parameter
-# map { CLDR name: Qt file name }
-calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew',
-
-generated_template = """
-/*
-    This part of the file was generated on %s from the
-    Common Locale Data Repository v%s
-
-    http://www.unicode.org/cldr/
-
-    Do not edit this section: instead regenerate it using
-    cldr2qlocalexml.py and qlocalexml2cpp.py on updated (or
-    edited) CLDR data; see qtbase/util/locale_database/.
-*/
-
-"""
-
-class Error:
-    def __init__(self, msg):
-        self.msg = msg
-    def __str__(self):
-        return self.msg
-
-def wrap_list(lst):
-    def split(lst, size):
-        while lst:
-            head, lst = lst[:size], lst[size:]
-            yield head
-    return ",\n".join(", ".join(x) for x in split(lst, 20))
-
-def isNodeNamed(elt, name, TYPE=xml.dom.minidom.Node.ELEMENT_NODE):
-    return elt.nodeType == TYPE and elt.nodeName == name
-
-def firstChildElt(parent, name):
-    child = parent.firstChild
-    while child:
-        if isNodeNamed(child, name):
-            return child
-        child = child.nextSibling
+def compareLocaleKeys(key1, key2):
+    if key1 == key2:
+        return 0
 
-    raise Error('No %s child found' % name)
+    if key1[0] != key2[0]: # First sort by language:
+        return key1[0] - key2[0]
 
-def eachEltInGroup(parent, group, key):
+    defaults = compareLocaleKeys.default_map
+    # maps {(language, script): country} by ID
     try:
-        element = firstChildElt(parent, group).firstChild
-    except Error:
-        element = None
-
-    while element:
-        if isNodeNamed(element, key):
-            yield element
-        element = element.nextSibling
-
-def eltWords(elt):
-    child = elt.firstChild
-    while child:
-        if child.nodeType == elt.TEXT_NODE:
-            yield child.nodeValue
-        child = child.nextSibling
-
-def firstChildText(elt, key):
-    return ' '.join(eltWords(firstChildElt(elt, key)))
-
-def loadMap(doc, category):
-    return dict((int(firstChildText(element, 'id')),
-                 (firstChildText(element, 'name'),
-                  firstChildText(element, 'code')))
-                for element in eachEltInGroup(doc.documentElement,
-                                              category + 'List', category))
-
-def loadLikelySubtagsMap(doc):
-    def triplet(element, keys=('language', 'script', 'country')):
-        return tuple(firstChildText(element, key) for key in keys)
-
-    return dict((i, {'from': triplet(firstChildElt(elt, "from")),
-                     'to': triplet(firstChildElt(elt, "to"))})
-                for i, elt in enumerate(eachEltInGroup(doc.documentElement,
-                                                       'likelySubtags', 'likelySubtag')))
-
-def fixedScriptName(name, dupes):
-    # Don't .capitalize() as some names are already camel-case (see enumdata.py):
-    name = ''.join(word[0].upper() + word[1:] for word in name.split())
-    if name[-6:] != "Script":
-        name = name + "Script"
-    if name in dupes:
-        sys.stderr.write("\n\n\nERROR: The script name '%s' is messy" % name)
-        sys.exit(1)
-    return name
-
-def fixedCountryName(name, dupes):
-    if name in dupes:
-        return name.replace(" ", "") + "Country"
-    return name.replace(" ", "")
-
-def fixedLanguageName(name, dupes):
-    if name in dupes:
-        return name.replace(" ", "") + "Language"
-    return name.replace(" ", "")
-
-def findDupes(country_map, language_map):
-    country_set = set(v[0] for a, v in country_map.iteritems())
-    language_set = set(v[0] for a, v in language_map.iteritems())
-    return country_set & language_set
-
-def languageNameToId(name, language_map):
-    for key in language_map.keys():
-        if language_map[key][0] == name:
-            return key
-    return -1
-
-def scriptNameToId(name, script_map):
-    for key in script_map.keys():
-        if script_map[key][0] == name:
-            return key
-    return -1
-
-def countryNameToId(name, country_map):
-    for key in country_map.keys():
-        if country_map[key][0] == name:
-            return key
-    return -1
-
-def loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map):
-    result = {}
-
-    for locale_elt in eachEltInGroup(doc.documentElement, "localeList", "locale"):
-        locale = Locale.fromXmlData(lambda k: firstChildText(locale_elt, k), calendars.keys())
-        language_id = languageNameToId(locale.language, language_map)
-        if language_id == -1:
-            sys.stderr.write("Cannot find a language id for '%s'\n" % locale.language)
-        script_id = scriptNameToId(locale.script, script_map)
-        if script_id == -1:
-            sys.stderr.write("Cannot find a script id for '%s'\n" % locale.script)
-        country_id = countryNameToId(locale.country, country_map)
-        if country_id == -1:
-            sys.stderr.write("Cannot find a country id for '%s'\n" % locale.country)
-
-        if language_id != 1: # C
-            if country_id == 0:
-                sys.stderr.write("loadLocaleMap: No country id for '%s'\n" % locale.language)
-
-            if script_id == 0:
-                # find default script for a given language and country (see http://www.unicode.org/reports/tr35/#Likely_Subtags)
-                for key in likely_subtags_map.keys():
-                    tmp = likely_subtags_map[key]
-                    if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == locale.country:
-                        locale.script = tmp["to"][1]
-                        script_id = scriptNameToId(locale.script, script_map)
-                        break
-            if script_id == 0 and country_id != 0:
-                # try with no country
-                for key in likely_subtags_map.keys():
-                    tmp = likely_subtags_map[key]
-                    if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry":
-                        locale.script = tmp["to"][1]
-                        script_id = scriptNameToId(locale.script, script_map)
-                        break
-
-        result[(language_id, script_id, country_id)] = locale
-
-    return result
+        country = defaults[key1[:2]]
+    except KeyError:
+        pass
+    else:
+        if key1[2] == country:
+            return -1
+        if key2[2] == country:
+            return 1
 
-def compareLocaleKeys(key1, key2):
-    if key1 == key2:
-        return 0
+    if key1[1] == key2[1]:
+        return key1[2] - key2[2]
 
-    if key1[0] == key2[0]:
-        l1 = compareLocaleKeys.locale_map[key1]
-        l2 = compareLocaleKeys.locale_map[key2]
-
-        if (l1.language, l1.script) in compareLocaleKeys.default_map.keys():
-            default = compareLocaleKeys.default_map[(l1.language, l1.script)]
-            if l1.country == default:
-                return -1
-            if l2.country == default:
-                return 1
-
-        if key1[1] != key2[1]:
-            if (l2.language, l2.script) in compareLocaleKeys.default_map.keys():
-                default = compareLocaleKeys.default_map[(l2.language, l2.script)]
-                if l2.country == default:
-                    return 1
-                if l1.country == default:
-                    return -1
-
-        if key1[1] != key2[1]:
-            return key1[1] - key2[1]
+    try:
+        country = defaults[key2[:2]]
+    except KeyError:
+        pass
     else:
-        return key1[0] - key2[0]
+        if key2[2] == country:
+            return 1
+        if key1[2] == country:
+            return -1
+
+    return key1[1] - key2[1]
 
-    return key1[2] - key2[2]
-
-
-def languageCount(language_id, locale_map):
-    result = 0
-    for key in locale_map.keys():
-        if key[0] == language_id:
-            result += 1
-    return result
-
-def unicode2hex(s):
-    lst = []
-    for x in s:
-        v = ord(x)
-        if v > 0xFFFF:
-            # make a surrogate pair
-            # copied from qchar.h
-            high = (v >> 10) + 0xd7c0
-            low = (v % 0x400 + 0xdc00)
-            lst.append(hex(high))
-            lst.append(hex(low))
-        else:
-            lst.append(hex(v))
-    return lst
 
 class StringDataToken:
     def __init__(self, index, length, bits):
         if index > 0xffff:
-            print "\n\n\n#error Data index is too big!", index
-            raise ValueError("Start-index (%d) exceeds the uint16 range!" % index)
+            raise ValueError('Start-index ({}) exceeds the uint16 range!'.format(index))
         if length >= (1 << bits):
-            print "\n\n\n#error Range length is too big!", length
-            raise ValueError("Data size (%d) exceeds the %d-bit range!" % (length, bits))
+            raise ValueError('Data size ({}) exceeds the {}-bit range!'.format(length, bits))
 
         self.index = index
         self.length = length
@@ -277,7 +92,7 @@ class StringData:
         self.name = name
         self.text = '' # Used in quick-search for matches in data
 
-    def append(self, s, bits=8):
+    def append(self, s, bits = 8):
         try:
             token = self.hash[s]
         except KeyError:
@@ -317,592 +132,481 @@ class StringData:
 
     def write(self, fd):
         if len(self.data) > 0xffff:
-            raise ValueError("Data is too big for quint16 index to its end!" % len(self.data),
+            raise ValueError('Data is too big ({}) for quint16 index to its end!'
+                             .format(len(self.data)),
                              self.name)
-        fd.write("\nstatic const char16_t %s[] = {\n" % self.name)
+        fd.write("\nstatic const char16_t {}[] = {{\n".format(self.name))
         fd.write(wrap_list(self.data))
         fd.write("\n};\n")
 
-def escapedString(s):
-    result = ""
-    i = 0
-    while i < len(s):
-        if s[i] == '"':
-            result += '\\"'
-            i += 1
-        else:
-            result += s[i]
-            i += 1
-    s = result
-
-    line = ""
-    need_escape = False
-    result = ""
-    for c in s:
-        if ord(c) < 128 and not (need_escape and ord('a') <= ord(c.lower()) <= ord('f')):
-            line += c
-            need_escape = False
-        else:
-            line += "\\x%02x" % (ord(c))
-            need_escape = True
-        if len(line) > 80:
-            result = result + "\n" + '"' + line + '"'
-            line = ""
-    line += "\\0"
-    result = result + "\n" + '"' + line + '"'
-    if result[0] == "\n":
-        result = result[1:]
-    return result
-
-def printEscapedString(s):
-    print escapedString(s)
-
 def currencyIsoCodeData(s):
     if s:
         return '{' + ",".join(str(ord(x)) for x in s) + '}'
     return "{0,0,0}"
 
-def usage():
-    print "Usage: qlocalexml2cpp.py <path-to-locale.xml> <path-to-qtbase-src-tree>"
-    sys.exit(1)
+class LocaleSourceEditor (SourceFileEditor):
+    __upinit = SourceFileEditor.__init__
+    def __init__(self, path, temp, version):
+        self.__upinit(path, temp)
+        self.writer.write("""
+/*
+    This part of the file was generated on {} from the
+    Common Locale Data Repository v{}
+
+    http://www.unicode.org/cldr/
+
+    Do not edit this section: instead regenerate it using
+    cldr2qlocalexml.py and qlocalexml2cpp.py on updated (or
+    edited) CLDR data; see qtbase/util/locale_database/.
+*/
 
-GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n"
-GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n"
+""".format(datetime.date.today(), version))
+
+class LocaleDataWriter (LocaleSourceEditor):
+    def likelySubtags(self, likely):
+        self.writer.write('static const QLocaleId likely_subtags[] = {\n')
+        for had, have, got, give, last in likely:
+            self.writer.write('    {{ {:3d}, {:3d}, {:3d} }}'.format(*have))
+            self.writer.write(', {{ {:3d}, {:3d}, {:3d} }}'.format(*give))
+            self.writer.write(' ' if last else ',')
+            self.writer.write(' // {} -> {}\n'.format(had, got))
+        self.writer.write('};\n\n')
+
+    def localeIndex(self, indices):
+        self.writer.write('static const quint16 locale_index[] = {\n')
+        for pair in indices:
+            self.writer.write('{:6d}, // {}\n'.format(*pair))
+        self.writer.write('     0 // trailing 0\n')
+        self.writer.write('};\n\n')
+
+    def localeData(self, locales, names):
+        list_pattern_part_data = StringData('list_pattern_part_data')
+        single_character_data = StringData('single_character_data')
+        date_format_data = StringData('date_format_data')
+        time_format_data = StringData('time_format_data')
+        days_data = StringData('days_data')
+        am_data = StringData('am_data')
+        pm_data = StringData('pm_data')
+        byte_unit_data = StringData('byte_unit_data')
+        currency_symbol_data = StringData('currency_symbol_data')
+        currency_display_name_data = StringData('currency_display_name_data')
+        currency_format_data = StringData('currency_format_data')
+        endonyms_data = StringData('endonyms_data')
+
+        # Locale data
+        self.writer.write('static const QLocaleData locale_data[] = {\n')
+        # Table headings: keep each label centred in its field, matching line_format:
+        self.writer.write('   // '
+                          # Width 6 + comma
+                          ' lang  ' # IDs
+                          'script '
+                          '  terr '
+
+                          # Range entries (all start-indices, then all sizes)
+                          # Width 5 + comma
+                          'lStrt ' # List pattern
+                          'lpMid '
+                          'lpEnd '
+                          'lPair '
+                          'lDelm ' # List delimiter
+                          # Representing numbers
+                          ' dec  '
+                          'group '
+                          'prcnt '
+                          ' zero '
+                          'minus '
+                          'plus  '
+                          ' exp  '
+                          # Quotation marks
+                          'qtOpn '
+                          'qtEnd '
+                          'altQO '
+                          'altQE '
+                          'lDFmt ' # Date format
+                          'sDFmt '
+                          'lTFmt ' # Time format
+                          'sTFmt '
+                          'slDay ' # Day names
+                          'lDays '
+                          'ssDys '
+                          'sDays '
+                          'snDay '
+                          'nDays '
+                          '  am  ' # am/pm indicators
+                          '  pm  '
+                          ' byte '
+                          'siQnt '
+                          'iecQn '
+                          'crSym ' # Currency formatting
+                          'crDsp '
+                          'crFmt '
+                          'crFNg '
+                          'ntLng ' # Name of language in itself, and of territory
+                          'ntTer '
+                          # Width 3 + comma for each size; no header
+                          + '    ' * 37 +
+
+                          # Strays (char array, bit-fields):
+                          # Width 10 + 2 spaces + comma
+                          '   currISO   '
+                          # Width 6 + comma
+                          'curDgt ' # Currency digits
+                          'curRnd ' # Currency rounding (unused: QTBUG-81343)
+                          'dow1st ' # First day of week
+                          ' wknd+ ' # Week-end start/end days
+                          ' wknd-'
+                          # No trailing space on last entry (be sure to
+                          # pad before adding anything after it).
+                          '\n')
+
+        formatLine = ''.join((
+            '    {{ ',
+            # Locale-identifier
+            '{:6d},' * 3,
+            # List patterns, date/time formats, day names, am/pm
+            # SI/IEC byte-unit abbreviations
+            # Currency and endonyms
+            # Range starts
+            '{:5d},' * 37,
+            # Range sizes
+            '{:3d},' * 37,
+
+            # Currency ISO code
+            ' {:>10s}, ',
+            # Currency formatting
+            '{:6d},{:6d}',
+            # Day of week and week-end
+            ',{:6d}' * 3,
+            ' }}')).format
+        for key in names:
+            locale = locales[key]
+            # Sequence of StringDataToken:
+            ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries:
+                            (locale.listPatternPartStart, locale.listPatternPartMiddle,
+                             locale.listPatternPartEnd, locale.listPatternPartTwo,
+                             locale.listDelim)) +
+                      tuple(single_character_data.append(p) for p in # 11 entries
+                            (locale.decimal, locale.group, locale.percent, locale.zero,
+                             locale.minus, locale.plus, locale.exp,
+                             locale.quotationStart, locale.quotationEnd,
+                             locale.alternateQuotationStart, locale.alternateQuotationEnd)) +
+                      tuple (date_format_data.append(f) for f in # 2 entries:
+                             (locale.longDateFormat, locale.shortDateFormat)) +
+                      tuple(time_format_data.append(f) for f in # 2 entries:
+                            (locale.longTimeFormat, locale.shortTimeFormat)) +
+                      tuple(days_data.append(d) for d in # 6 entries:
+                            (locale.standaloneLongDays, locale.longDays,
+                             locale.standaloneShortDays, locale.shortDays,
+                             locale.standaloneNarrowDays, locale.narrowDays)) +
+                      (am_data.append(locale.am), pm_data.append(locale.pm)) + # 2 entries
+                      tuple(byte_unit_data.append(b) for b in # 3 entries:
+                            (locale.byte_unit,
+                             locale.byte_si_quantified,
+                             locale.byte_iec_quantified)) +
+                      (currency_symbol_data.append(locale.currencySymbol),
+                       currency_display_name_data.append(locale.currencyDisplayName),
+                       currency_format_data.append(locale.currencyFormat),
+                       currency_format_data.append(locale.currencyNegativeFormat),
+                       endonyms_data.append(locale.languageEndonym),
+                       endonyms_data.append(locale.countryEndonym)) # 6 entries
+                      ) # Total: 37 entries
+            assert len(ranges) == 37
+
+            self.writer.write(formatLine(*(
+                        key +
+                        tuple(r.index for r in ranges) +
+                        tuple(r.length for r in ranges) +
+                        (currencyIsoCodeData(locale.currencyIsoCode),
+                         locale.currencyDigits,
+                         locale.currencyRounding, # unused (QTBUG-81343)
+                         locale.firstDayOfWeek,
+                         locale.weekendStart,
+                         locale.weekendEnd) ))
+                              + ', // {}/{}/{}\n'.format(
+                    locale.language, locale.script, locale.country))
+        self.writer.write(formatLine(*( # All zeros, matching the format:
+                    (0,) * 3 + (0,) * 37 * 2
+                    + (currencyIsoCodeData(0),)
+                    + (0,) * 2
+                    + (0,) * 3 ))
+                          + ' // trailing zeros\n')
+        self.writer.write('};\n')
+
+        # StringData tables:
+        for data in (list_pattern_part_data, single_character_data,
+                     date_format_data, time_format_data, days_data,
+                     byte_unit_data, am_data, pm_data, currency_symbol_data,
+                     currency_display_name_data, currency_format_data,
+                     endonyms_data):
+            data.write(self.writer)
+
+    @staticmethod
+    def __writeNameData(out, book, form): # Writes {form}_name_list and its {form}_name_index
+        out('static const char {}_name_list[] =\n'.format(form))
+        out('"Default\\0"\n')
+        for key, value in book.items():
+            if key == 0: # Entry 0 is covered by the "Default" string written above
+                continue
+            out('"' + value[0] + '\\0"\n')
+        out(';\n\n')
+
+        out('static const quint16 {}_name_index[] = {{\n'.format(form))
+        out('     0, // Any{}\n'.format(form.capitalize()))
+        index = 8 # len('Default') + 1: offset of the first name after "Default\0"
+        for key, value in book.items(): # Must visit names in the same order as the loop above
+            if key == 0:
+                continue
+            name = value[0]
+            out('{:6d}, // {}\n'.format(index, name))
+            index += len(name) + 1 # + 1 for the \0 terminator; NOTE(review): assumes one char per byte
+        out('};\n\n')
+
+    @staticmethod
+    def __writeCodeList(out, book, form, width):
+        out('static const unsigned char {}_code_list[] =\n'.format(form))
+        for key, value in book.items():
+            code = value[1]
+            code += r'\0' * max(width - len(code), 0)
+            out('"{}" // {}\n'.format(code, value[0]))
+        out(';\n\n')
+
+    def languageNames(self, languages):
+        self.__writeNameData(self.writer.write, languages, 'language')
+
+    def scriptNames(self, scripts):
+        self.__writeNameData(self.writer.write, scripts, 'script')
+
+    def countryNames(self, countries):
+        self.__writeNameData(self.writer.write, countries, 'country')
+
+    # TODO: unify these next three into the previous three; kept
+    # separate for now to verify we're not changing data.
+
+    def languageCodes(self, languages):
+        self.__writeCodeList(self.writer.write, languages, 'language', 3)
+
+    def scriptCodes(self, scripts):
+        self.__writeCodeList(self.writer.write, scripts, 'script', 4)
+
+    def countryCodes(self, countries): # TODO: unify with countryNames()
+        self.__writeCodeList(self.writer.write, countries, 'country', 3)
+
+class CalendarDataWriter (LocaleSourceEditor): # Emits one calendar's QCalendarLocale table
+    formatCalendar = (
+        '      {{'
+        + ','.join(('{:6d}',) * 3 + ('{:5d}',) * 6 + ('{:3d}',) * 6)
+        + ' }},').format
+    def write(self, calendar, locales, names): # calendar: key into each locale's month-name maps
+        months_data = StringData('months_data')
 
-def main():
-    if len(sys.argv) != 3:
-        usage()
+        self.writer.write('static const QCalendarLocale locale_data[] = {\n')
+        self.writer.write(
+            '     //'
+            # IDs, width 7 (6 + comma)
+            ' lang  '
+            ' script'
+            ' terr  '
+            # Month-name start-indices, width 6 (5 + comma)
+            'sLong '
+            ' long '
+            'sShrt '
+            'short '
+            'sNarw '
+            'narow '
+            #  No individual headers for the sizes.
+            'Sizes...'
+            '\n')
+        for key in names:
+            locale = locales[key]
+            # Sequence of StringDataToken:
+            try:
+                # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
+                ranges = (tuple(months_data.append(m[calendar], 16) for m in
+                                (locale.standaloneLongMonths, locale.longMonths)) +
+                          tuple(months_data.append(m[calendar]) for m in
+                                (locale.standaloneShortMonths, locale.shortMonths,
+                                 locale.standaloneNarrowMonths, locale.narrowMonths)))
+            except ValueError as e: # Say which locale and calendar failed:
+                e.args += (locale.language, locale.script, locale.country, calendar)
+                raise
 
-    qlocalexml = sys.argv[1]
-    qtsrcdir = sys.argv[2]
+            self.writer.write(
+                self.formatCalendar(*(
+                        key +
+                        tuple(r.index for r in ranges) +
+                        tuple(r.length for r in ranges) ))
+                + '// {}/{}/{}\n'.format(locale.language, locale.script, locale.country))
+        self.writer.write(self.formatCalendar(*( (0,) * (3 + 6 * 2) ))
+                          + '// trailing zeros\n')
+        self.writer.write('};\n')
+        months_data.write(self.writer)
+
+class LocaleHeaderWriter (SourceFileEditor): # Writes the Language, Script and Country enums
+    __upinit = SourceFileEditor.__init__
+    def __init__(self, path, temp, dupes): # dupes: names needing an enum-name suffix (see __enum)
+        self.__upinit(path, temp)
+        self.__dupes = dupes
+
+    def languages(self, languages):
+        self.__enum('Language', languages, self.__language)
+        self.writer.write('\n')
+
+    def countries(self, countries):
+        self.__enum('Country', countries, self.__country)
+
+    def scripts(self, scripts):
+        self.__enum('Script', scripts, self.__script)
+        self.writer.write('\n')
+
+    # Implementation details (class-scope import: the __names are mangled, so private to this class)
+    from enumdata import (language_aliases as __language,
+                          country_aliases as __country,
+                          script_aliases as __script)
+
+    def __enum(self, name, book, alias): # Writes enum <name>: members from book, then aliases, then Last<name>
+        assert book # The loop must run at least once: Last<name>, below, uses its final member
+        out, dupes = self.writer.write, self.__dupes
+        out('    enum {} {{\n'.format(name))
+        for key, value in book.items():
+            member = value[0]
+            if name == 'Script':
+                # Don't .capitalize() as some names are already camel-case (see enumdata.py):
+                member = ''.join(word[0].upper() + word[1:] for word in member.split())
+                if not member.endswith('Script'):
+                    member += 'Script'
+                if member in dupes:
+                    raise Error('The script name "{}" is messy'.format(member))
+            else:
+                member = ''.join(member.split()) # Drop all white-space from the name
+                member = member + name if member in dupes else member # Disambiguate by suffixing the enum name
+            out('        {} = {},\n'.format(member, key))
+
+        out('\n        '
+            + ',\n        '.join('{} = {}'.format(*pair)
+                                 for pair in sorted(alias.items()))
+            + ',\n\n        Last{} = {}\n    }};\n'.format(name, member)) # member: last one written by the loop
+
+def usage(name, err, message = ''):
+    err.write("""Usage: {} path/to/qlocale.xml root/of/qtbase
+""".format(name)) # TODO: elaborate
+    if message:
+        err.write('\n' + message + '\n')
+
+def main(args, out, err):
+    # TODO: Make calendars a command-line parameter
+    # map { CLDR name: Qt file name }
+    calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew',
+
+    name = args.pop(0)
+    if len(args) != 2:
+        usage(name, err, 'I expect two arguments')
+        return 1
+
+    qlocalexml = args.pop(0)
+    qtsrcdir = args.pop(0)
 
     if not (os.path.isdir(qtsrcdir)
             and all(os.path.isfile(os.path.join(qtsrcdir, 'src', 'corelib', 'text', leaf))
                     for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))):
-        usage()
-
-    (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p.h", dir=qtsrcdir)
-    data_temp_file = os.fdopen(data_temp_file, "w")
-    qlocaledata_file = open(qtsrcdir + "/src/corelib/text/qlocale_data_p.h", "r")
-    s = qlocaledata_file.readline()
-    while s and s != GENERATED_BLOCK_START:
-        data_temp_file.write(s)
-        s = qlocaledata_file.readline()
-    data_temp_file.write(GENERATED_BLOCK_START)
-
-    doc = xml.dom.minidom.parse(qlocalexml)
-    language_map = loadMap(doc, 'language')
-    script_map = loadMap(doc, 'script')
-    country_map = loadMap(doc, 'country')
-    likely_subtags_map = loadLikelySubtagsMap(doc)
-    default_map = {}
-    for key in likely_subtags_map.keys():
-        tmp = likely_subtags_map[key]
-        if tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry" and tmp["to"][2] != "AnyCountry":
-            default_map[(tmp["to"][0], tmp["to"][1])] = tmp["to"][2]
-    locale_map = loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map)
-    dupes = findDupes(language_map, country_map)
-
-    cldr_version = firstChildText(doc.documentElement, "version")
-    data_temp_file.write(generated_template % (datetime.date.today(), cldr_version))
-
-    # Likely subtags map
-    data_temp_file.write("static const QLocaleId likely_subtags[] = {\n")
-    index = 0
-    for key in likely_subtags_map.keys():
-        tmp = likely_subtags_map[key]
-        from_language = languageNameToId(tmp["from"][0], language_map)
-        from_script = scriptNameToId(tmp["from"][1], script_map)
-        from_country = countryNameToId(tmp["from"][2], country_map)
-        to_language = languageNameToId(tmp["to"][0], language_map)
-        to_script = scriptNameToId(tmp["to"][1], script_map)
-        to_country = countryNameToId(tmp["to"][2], country_map)
-
-        cmnt_from = ""
-        if from_language != 0:
-            cmnt_from = cmnt_from + language_map[from_language][1]
-        else:
-            cmnt_from = cmnt_from + "und"
-        if from_script != 0:
-            if cmnt_from:
-                cmnt_from = cmnt_from + "_"
-            cmnt_from = cmnt_from + script_map[from_script][1]
-        if from_country != 0:
-            if cmnt_from:
-                cmnt_from = cmnt_from + "_"
-            cmnt_from = cmnt_from + country_map[from_country][1]
-        cmnt_to = ""
-        if to_language != 0:
-            cmnt_to = cmnt_to + language_map[to_language][1]
-        else:
-            cmnt_to = cmnt_to + "und"
-        if to_script != 0:
-            if cmnt_to:
-                cmnt_to = cmnt_to + "_"
-            cmnt_to = cmnt_to + script_map[to_script][1]
-        if to_country != 0:
-            if cmnt_to:
-                cmnt_to = cmnt_to + "_"
-            cmnt_to = cmnt_to + country_map[to_country][1]
-
-        data_temp_file.write("    ")
-        data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" %
-                             (from_language, from_script, from_country, to_language, to_script, to_country))
-        index += 1
-        if index != len(likely_subtags_map):
-            data_temp_file.write(",")
-        else:
-            data_temp_file.write(" ")
-        data_temp_file.write(" // %s -> %s\n" % (cmnt_from, cmnt_to))
-    data_temp_file.write("};\n")
-
-    data_temp_file.write("\n")
-
-    # Locale index
-    data_temp_file.write("static const quint16 locale_index[] = {\n")
-    index = 0
-    for key in language_map.keys():
-        i = 0
-        count = languageCount(key, locale_map)
-        if count > 0:
-            i = index
-            index += count
-        data_temp_file.write("%6d, // %s\n" % (i, language_map[key][0]))
-    data_temp_file.write("     0 // trailing 0\n")
-    data_temp_file.write("};\n\n")
-
-    list_pattern_part_data = StringData('list_pattern_part_data')
-    single_character_data = StringData('single_character_data')
-    date_format_data = StringData('date_format_data')
-    time_format_data = StringData('time_format_data')
-    days_data = StringData('days_data')
-    am_data = StringData('am_data')
-    pm_data = StringData('pm_data')
-    byte_unit_data = StringData('byte_unit_data')
-    currency_symbol_data = StringData('currency_symbol_data')
-    currency_display_name_data = StringData('currency_display_name_data')
-    currency_format_data = StringData('currency_format_data')
-    endonyms_data = StringData('endonyms_data')
-
-    # Locale data
-    data_temp_file.write("static const QLocaleData locale_data[] = {\n")
-    # Table headings: keep each label centred in its field, matching line_format:
-    data_temp_file.write('   // '
-                         # Width 6 + comma:
-                         + ' lang  ' # IDs
-                         + 'script '
-                         + '  terr '
-
-                         # Range entries (all start-indices, then all sizes):
-                         # Width 5 + comma:
-                         + 'lStrt ' # List pattern
-                         + 'lpMid '
-                         + 'lpEnd '
-                         + 'lPair '
-                         + 'lDelm ' # List delimiter
-                         # Representing numbers:
-                         + ' dec  '
-                         + 'group '
-                         + 'prcnt '
-                         + ' zero '
-                         + 'minus '
-                         + 'plus  '
-                         + ' exp  '
-                         # Quotation marks
-                         + 'qtOpn '
-                         + 'qtEnd '
-                         + 'altQO '
-                         + 'altQE '
-                         + 'lDFmt ' # Date format
-                         + 'sDFmt '
-                         + 'lTFmt ' # Time format
-                         + 'sTFmt '
-                         + 'slDay ' # Day names
-                         + 'lDays '
-                         + 'ssDys '
-                         + 'sDays '
-                         + 'snDay '
-                         + 'nDays '
-                         + '  am  ' # am/pm indicators
-                         + '  pm  '
-                         + ' byte '
-                         + 'siQnt '
-                         + 'iecQn '
-                         + 'crSym ' # Currency formatting:
-                         + 'crDsp '
-                         + 'crFmt '
-                         + 'crFNg '
-                         + 'ntLng ' # Name of language in itself, and of territory:
-                         + 'ntTer '
-                         # Width 3 + comma for each size; no header
-                         + '    ' * 37
-
-                         # Strays (char array, bit-fields):
-                         # Width 8+4 + comma
-                         + '   currISO   '
-                         # Width 6 + comma:
-                         + 'curDgt ' # Currency digits
-                         + 'curRnd ' # Currencty rounding (unused: QTBUG-81343)
-                         + 'dow1st ' # First day of week
-                         + ' wknd+ ' # Week-end start/end days:
-                         + ' wknd-'
-                         # No trailing space on last entry (be sure to
-                         # pad before adding anything after it).
-                         + '\n')
+        usage(name, err, 'Missing expected files under qtbase source root ' + qtsrcdir)
+        return 1
+
+    reader = QLocaleXmlReader(qlocalexml)
+    locale_map = dict(reader.loadLocaleMap(calendars, err.write))
 
     locale_keys = locale_map.keys()
-    compareLocaleKeys.default_map = default_map
-    compareLocaleKeys.locale_map = locale_map
+    compareLocaleKeys.default_map = dict(reader.defaultMap())
     locale_keys.sort(compareLocaleKeys)
 
-    line_format = ('    { '
-                   # Locale-identifier:
-                   + '%6d,' * 3
-                   # Offsets for starts of ranges:
-                   + '%5d,' * 37
-                   # Sizes for the same:
-                   + '%3d,' * 37
-
-                   # Currency ISO code:
-                   + ' %10s, '
-                   # Currency formatting:
-                   + '%6d,%6d'
-                   # Day of week and week-end:
-                   + ',%6d' * 3
-                   + ' }')
-    for key in locale_keys:
-        l = locale_map[key]
-        # Sequence of StringDataToken:
-        ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries:
-                        (l.listPatternPartStart, l.listPatternPartMiddle,
-                         l.listPatternPartEnd, l.listPatternPartTwo, l.listDelim)) +
-                  tuple(single_character_data.append(p) for p in # 11 entries
-                        (l.decimal, l.group, l.percent, l.zero, l.minus, l.plus, l.exp,
-                         l.quotationStart, l.quotationEnd,
-                         l.alternateQuotationStart, l.alternateQuotationEnd)) +
-                  tuple (date_format_data.append(f) for f in # 2 entries:
-                         (l.longDateFormat, l.shortDateFormat)) +
-                  tuple(time_format_data.append(f) for f in # 2 entries:
-                        (l.longTimeFormat, l.shortTimeFormat)) +
-                  tuple(days_data.append(d) for d in # 6 entries:
-                        (l.standaloneLongDays, l.longDays,
-                         l.standaloneShortDays, l.shortDays,
-                         l.standaloneNarrowDays, l.narrowDays)) +
-                  (am_data.append(l.am), pm_data.append(l.pm)) + # 2 entries:
-                  tuple(byte_unit_data.append(b) for b in # 3 entries:
-                        (l.byte_unit, l.byte_si_quantified, l.byte_iec_quantified)) +
-                  (currency_symbol_data.append(l.currencySymbol),
-                   currency_display_name_data.append(l.currencyDisplayName),
-                   currency_format_data.append(l.currencyFormat),
-                   currency_format_data.append(l.currencyNegativeFormat),
-                   endonyms_data.append(l.languageEndonym),
-                   endonyms_data.append(l.countryEndonym)) # 6 entries
-                  ) # Total: 37 entries
-        assert len(ranges) == 37
-
-        data_temp_file.write(line_format
-                    % ((key[0], key[1], key[2]) +
-                       tuple(r.index for r in ranges) +
-                       tuple(r.length for r in ranges) +
-                       (currencyIsoCodeData(l.currencyIsoCode),
-                        l.currencyDigits,
-                        l.currencyRounding, # unused (QTBUG-81343)
-                        l.firstDayOfWeek,
-                        l.weekendStart,
-                        l.weekendEnd))
-                             + ", // %s/%s/%s\n" % (l.language, l.script, l.country))
-    data_temp_file.write(line_format # All zeros, matching the format:
-                         % ( (0,) * 3 + (0,) * 37 * 2
-                             + (currencyIsoCodeData(0),)
-                             + (0,) * 2
-                             + (0,) * 3)
-                         + " // trailing zeros\n")
-    data_temp_file.write("};\n")
-
-    # StringData tables:
-    for data in (list_pattern_part_data, single_character_data,
-                 date_format_data, time_format_data, days_data,
-                 byte_unit_data, am_data, pm_data, currency_symbol_data,
-                 currency_display_name_data, currency_format_data,
-                 endonyms_data):
-        data.write(data_temp_file)
-
-    data_temp_file.write("\n")
-
-    # Language name list
-    data_temp_file.write("static const char language_name_list[] =\n")
-    data_temp_file.write('"Default\\0"\n')
-    for key in language_map.keys():
-        if key == 0:
-            continue
-        data_temp_file.write('"' + language_map[key][0] + '\\0"\n')
-    data_temp_file.write(";\n")
-
-    data_temp_file.write("\n")
-
-    # Language name index
-    data_temp_file.write("static const quint16 language_name_index[] = {\n")
-    data_temp_file.write("     0, // AnyLanguage\n")
-    index = 8
-    for key in language_map.keys():
-        if key == 0:
-            continue
-        language = language_map[key][0]
-        data_temp_file.write("%6d, // %s\n" % (index, language))
-        index += len(language) + 1
-    data_temp_file.write("};\n")
-
-    data_temp_file.write("\n")
-
-    # Script name list
-    data_temp_file.write("static const char script_name_list[] =\n")
-    data_temp_file.write('"Default\\0"\n')
-    for key in script_map.keys():
-        if key == 0:
-            continue
-        data_temp_file.write('"' + script_map[key][0] + '\\0"\n')
-    data_temp_file.write(";\n")
-
-    data_temp_file.write("\n")
-
-    # Script name index
-    data_temp_file.write("static const quint16 script_name_index[] = {\n")
-    data_temp_file.write("     0, // AnyScript\n")
-    index = 8
-    for key in script_map.keys():
-        if key == 0:
-            continue
-        script = script_map[key][0]
-        data_temp_file.write("%6d, // %s\n" % (index, script))
-        index += len(script) + 1
-    data_temp_file.write("};\n")
-
-    data_temp_file.write("\n")
-
-    # Country name list
-    data_temp_file.write("static const char country_name_list[] =\n")
-    data_temp_file.write('"Default\\0"\n')
-    for key in country_map.keys():
-        if key == 0:
-            continue
-        data_temp_file.write('"' + country_map[key][0] + '\\0"\n')
-    data_temp_file.write(";\n")
-
-    data_temp_file.write("\n")
-
-    # Country name index
-    data_temp_file.write("static const quint16 country_name_index[] = {\n")
-    data_temp_file.write("     0, // AnyCountry\n")
-    index = 8
-    for key in country_map.keys():
-        if key == 0:
-            continue
-        country = country_map[key][0]
-        data_temp_file.write("%6d, // %s\n" % (index, country))
-        index += len(country) + 1
-    data_temp_file.write("};\n")
-
-    data_temp_file.write("\n")
-
-    # Language code list
-    data_temp_file.write("static const unsigned char language_code_list[] =\n")
-    for key in language_map.keys():
-        code = language_map[key][1]
-        if len(code) == 2:
-            code += r"\0"
-        data_temp_file.write('"%2s" // %s\n' % (code, language_map[key][0]))
-    data_temp_file.write(";\n")
-
-    data_temp_file.write("\n")
-
-    # Script code list
-    data_temp_file.write("static const unsigned char script_code_list[] =\n")
-    for key in script_map.keys():
-        code = script_map[key][1]
-        for i in range(4 - len(code)):
-            code += "\\0"
-        data_temp_file.write('"%2s" // %s\n' % (code, script_map[key][0]))
-    data_temp_file.write(";\n")
-
-    # Country code list
-    data_temp_file.write("static const unsigned char country_code_list[] =\n")
-    for key in country_map.keys():
-        code = country_map[key][1]
-        if len(code) == 2:
-            code += "\\0"
-        data_temp_file.write('"%2s" // %s\n' % (code, country_map[key][0]))
-    data_temp_file.write(";\n")
-
-    data_temp_file.write("\n")
-    data_temp_file.write(GENERATED_BLOCK_END)
-    s = qlocaledata_file.readline()
-    # skip until end of the old block
-    while s and s != GENERATED_BLOCK_END:
-        s = qlocaledata_file.readline()
-
-    s = qlocaledata_file.readline()
-    while s:
-        data_temp_file.write(s)
-        s = qlocaledata_file.readline()
-    data_temp_file.close()
-    qlocaledata_file.close()
-
-    os.remove(qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
-    os.rename(data_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
+    try:
+        writer = LocaleDataWriter(os.path.join(qtsrcdir,  'src', 'corelib', 'text',
+                                               'qlocale_data_p.h'),
+                                  qtsrcdir, reader.cldrVersion)
+    except IOError as e:
+        err.write('Failed to open files to transcribe locale data: ' + (e.message or e.args[1]))
+        return 1
+
+    try:
+        writer.likelySubtags(reader.likelyMap())
+        writer.localeIndex(reader.languageIndices(tuple(k[0] for k in locale_map)))
+        writer.localeData(locale_map, locale_keys)
+        writer.writer.write('\n')
+        writer.languageNames(reader.languages)
+        writer.scriptNames(reader.scripts)
+        writer.countryNames(reader.countries)
+        # TODO: merge the next three into the previous three
+        writer.languageCodes(reader.languages)
+        writer.scriptCodes(reader.scripts)
+        writer.countryCodes(reader.countries)
+    except Error as e:
+        writer.cleanup()
+        err.write('\nError updating locale data: ' + e.message + '\n')
+        return 1
+
+    writer.close()
 
     # Generate calendar data
-    calendar_format = '      {%6d,%6d,%6d' + ',%5d' * 6 + ',%3d' * 6 + ' },'
     for calendar, stem in calendars.items():
-        months_data = StringData('months_data')
-        calendar_data_file = "q%scalendar_data_p.h" % stem
-        calendar_template_file = open(os.path.join(qtsrcdir, 'src', 'corelib', 'time',
-                                                   calendar_data_file), "r")
-        (calendar_temp_file, calendar_temp_file_path) = tempfile.mkstemp(calendar_data_file, dir=qtsrcdir)
-        calendar_temp_file = os.fdopen(calendar_temp_file, "w")
-        s = calendar_template_file.readline()
-        while s and s != GENERATED_BLOCK_START:
-            calendar_temp_file.write(s)
-            s = calendar_template_file.readline()
-        calendar_temp_file.write(GENERATED_BLOCK_START)
-        calendar_temp_file.write(generated_template % (datetime.date.today(), cldr_version))
-        calendar_temp_file.write("static const QCalendarLocale locale_data[] = {\n")
-        calendar_temp_file.write('   // '
-                                 # IDs, width 7 (6 + comma)
-                                 + ' lang  '
-                                 + ' script'
-                                 + ' terr  '
-                                 # Month-name start-indices, width 6 (5 + comma):
-                                 + 'sLng '
-                                 + 'long '
-                                 + 'sSrt '
-                                 + 'shrt '
-                                 + 'sNrw '
-                                 + 'naro '
-                                 # No individual headers for the sizes.
-                                 + 'Sizes...'
-                                 + '\n')
-        for key in locale_keys:
-            l = locale_map[key]
-            # Sequence of StringDataToken:
-            try:
-                # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
-                ranges = (tuple(months_data.append(m[calendar], 16) for m in
-                                (l.standaloneLongMonths, l.longMonths)) +
-                          tuple(months_data.append(m[calendar]) for m in
-                                (l.standaloneShortMonths, l.shortMonths,
-                                 l.standaloneNarrowMonths, l.narrowMonths)))
-            except ValueError as e:
-                e.args += (l.language, l.script, l.country, stem)
-                raise
+        try:
+            writer = CalendarDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'time',
+                                                     'q{}calendar_data_p.h'.format(stem)),
+                                        qtsrcdir, reader.cldrVersion)
+        except IOError as e:
+            err.write('Failed to open files to transcribe ' + calendar
+                             + ' data ' + (e.message or e.args[1]))
+            return 1
+
+        try:
+            writer.write(calendar, locale_map, locale_keys)
+        except Error as e:
+            writer.cleanup()
+            err.write('\nError updating ' + calendar + ' locale data: ' + e.message + '\n')
+            return 1
 
-            calendar_temp_file.write(
-                calendar_format
-                % ((key[0], key[1], key[2]) +
-                   tuple(r.index for r in ranges) +
-                   tuple(r.length for r in ranges))
-                + "// %s/%s/%s\n" % (l.language, l.script, l.country))
-        calendar_temp_file.write(calendar_format % ( (0,) * (3 + 6 * 2) )
-                                 + '// trailing zeros\n')
-        calendar_temp_file.write("};\n")
-        months_data.write(calendar_temp_file)
-        s = calendar_template_file.readline()
-        while s and s != GENERATED_BLOCK_END:
-            s = calendar_template_file.readline()
-        while s:
-            calendar_temp_file.write(s)
-            s = calendar_template_file.readline()
-        os.rename(calendar_temp_file_path,
-                  os.path.join(qtsrcdir, 'src', 'corelib', 'time', calendar_data_file))
+        writer.close()
 
     # qlocale.h
+    try:
+        writer = LocaleHeaderWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.h'),
+                                    qtsrcdir, reader.dupes)
+    except IOError as e:
+        err.write('Failed to open files to transcribe qlocale.h: ' + (e.message or e.args[1]))
+        return 1
 
-    (qlocaleh_temp_file, qlocaleh_temp_file_path) = tempfile.mkstemp("qlocale.h", dir=qtsrcdir)
-    qlocaleh_temp_file = os.fdopen(qlocaleh_temp_file, "w")
-    qlocaleh_file = open(qtsrcdir + "/src/corelib/text/qlocale.h", "r")
-    s = qlocaleh_file.readline()
-    while s and s != GENERATED_BLOCK_START:
-        qlocaleh_temp_file.write(s)
-        s = qlocaleh_file.readline()
-    qlocaleh_temp_file.write(GENERATED_BLOCK_START)
-    qlocaleh_temp_file.write("// see qlocale_data_p.h for more info on generated data\n")
-
-    # Language enum
-    qlocaleh_temp_file.write("    enum Language {\n")
-    language = None
-    for key, value in language_map.items():
-        language = fixedLanguageName(value[0], dupes)
-        qlocaleh_temp_file.write("        " + language + " = " + str(key) + ",\n")
-
-    qlocaleh_temp_file.write("\n        " +
-                             ",\n        ".join('%s = %s' % pair
-                                                for pair in sorted(language_aliases.items())) +
-                             ",\n")
-    qlocaleh_temp_file.write("\n")
-    qlocaleh_temp_file.write("        LastLanguage = " + language + "\n")
-    qlocaleh_temp_file.write("    };\n\n")
-
-    # Script enum
-    qlocaleh_temp_file.write("    enum Script {\n")
-    script = None
-    for key, value in script_map.items():
-        script = fixedScriptName(value[0], dupes)
-        qlocaleh_temp_file.write("        " + script + " = " + str(key) + ",\n")
-    qlocaleh_temp_file.write("\n        " +
-                             ",\n        ".join('%s = %s' % pair
-                                                for pair in sorted(script_aliases.items())) +
-                             ",\n")
-    qlocaleh_temp_file.write("\n")
-    qlocaleh_temp_file.write("        LastScript = " + script + "\n")
-    qlocaleh_temp_file.write("    };\n\n")
-
-    # Country enum
-    qlocaleh_temp_file.write("    enum Country {\n")
-    country = None
-    for key, value in country_map.items():
-        country = fixedCountryName(value[0], dupes)
-        qlocaleh_temp_file.write("        " + country + " = " + str(key) + ",\n")
-    qlocaleh_temp_file.write("\n        " +
-                             ",\n        ".join('%s = %s' % pair
-                                                for pair in sorted(country_aliases.items())) +
-                             ",\n")
-    qlocaleh_temp_file.write("\n")
-    qlocaleh_temp_file.write("        LastCountry = " + country + "\n")
-    qlocaleh_temp_file.write("    };\n")
-
-    qlocaleh_temp_file.write(GENERATED_BLOCK_END)
-    s = qlocaleh_file.readline()
-    # skip until end of the old block
-    while s and s != GENERATED_BLOCK_END:
-        s = qlocaleh_file.readline()
-
-    s = qlocaleh_file.readline()
-    while s:
-        qlocaleh_temp_file.write(s)
-        s = qlocaleh_file.readline()
-    qlocaleh_temp_file.close()
-    qlocaleh_file.close()
-
-    os.remove(qtsrcdir + "/src/corelib/text/qlocale.h")
-    os.rename(qlocaleh_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.h")
+    try:
+        writer.languages(reader.languages)
+        writer.scripts(reader.scripts)
+        writer.countries(reader.countries)
+    except Error as e:
+        writer.cleanup()
+        err.write('\nError updating qlocale.h: ' + e.message + '\n')
+        return 1
+
+    writer.close()
 
     # qlocale.qdoc
+    try:
+        writer = Transcriber(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.qdoc'),
+                             qtsrcdir)
+    except IOError as e:
+        err.write('Failed to open files to transcribe qlocale.qdoc: ' + (e.message or e.args[1]))
+        return 1
 
-    (qlocaleqdoc_temp_file, qlocaleqdoc_temp_file_path) = tempfile.mkstemp("qlocale.qdoc", dir=qtsrcdir)
-    qlocaleqdoc_temp_file = os.fdopen(qlocaleqdoc_temp_file, "w")
-    qlocaleqdoc_file = open(qtsrcdir + "/src/corelib/text/qlocale.qdoc", "r")
-    s = qlocaleqdoc_file.readline()
     DOCSTRING = "    QLocale's data is based on Common Locale Data Repository "
-    while s:
-        if DOCSTRING in s:
-            qlocaleqdoc_temp_file.write(DOCSTRING + "v" + cldr_version + ".\n")
-        else:
-            qlocaleqdoc_temp_file.write(s)
-        s = qlocaleqdoc_file.readline()
-    qlocaleqdoc_temp_file.close()
-    qlocaleqdoc_file.close()
-
-    os.remove(qtsrcdir + "/src/corelib/text/qlocale.qdoc")
-    os.rename(qlocaleqdoc_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.qdoc")
+    try:
+        for line in writer.reader:
+            if DOCSTRING in line:
+                writer.writer.write(DOCSTRING + 'v' + reader.cldrVersion + '.\n')
+            else:
+                writer.writer.write(line)
+    except Error as e:
+        writer.cleanup()
+        err.write('\nError updating qlocale.qdoc: ' + e.message + '\n')
+        return 1
+
+    writer.close()
+    return 0
 
 if __name__ == "__main__":
-    main()
+    import sys
+    sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/xpathlite.py b/util/locale_database/xpathlite.py
deleted file mode 100644
index 97efaaab41..0000000000
--- a/util/locale_database/xpathlite.py
+++ /dev/null
@@ -1,288 +0,0 @@
-#!/usr/bin/env python
-#############################################################################
-##
-## Copyright (C) 2016 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
-
-import sys
-import os
-import xml.dom.minidom
-
-class DraftResolution:
-    # See http://www.unicode.org/cldr/process.html for description
-    unconfirmed = 'unconfirmed'
-    provisional = 'provisional'
-    contributed = 'contributed'
-    approved = 'approved'
-    _values = { unconfirmed : 1, provisional : 2, contributed : 3, approved : 4 }
-    def __init__(self, resolution):
-        self.resolution = resolution
-    def toInt(self):
-        return DraftResolution._values[self.resolution]
-
-class Error:
-    def __init__(self, msg):
-        self.msg = msg
-    def __str__(self):
-        return self.msg
-
-doc_cache = {}
-def parseDoc(file):
-    if not doc_cache.has_key(file):
-        doc_cache[file] = xml.dom.minidom.parse(file)
-    return doc_cache[file]
-
-def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
-    for node in parent.childNodes:
-        if node.nodeType != node.ELEMENT_NODE:
-            continue
-        if node.nodeName != tag_name:
-            continue
-        if arg_value:
-            if not node.attributes.has_key(arg_name):
-                continue
-            if node.attributes[arg_name].nodeValue != arg_value:
-                continue
-        if draft:
-            if not node.attributes.has_key('draft'):
-                # if draft is not specified then it's approved
-                return node
-            value = node.attributes['draft'].nodeValue
-            value = DraftResolution(value).toInt()
-            exemplar = DraftResolution(draft).toInt()
-            if exemplar > value:
-                continue
-        return node
-    return False
-
-def codeMapsFromFile(file):
-    """Extract mappings of language, script and country codes to names.
-
-    The file shall typically be common/main/en.xml, which contains a
-    localeDisplayNames element with children languages, scripts and
-    territories; each element in each of these has a code as its type
-    attribute and its name as element content.  This returns a mapping
-    withe keys 'language', 'script' and 'country', each of which
-    has, as value, a mapping of the relevant codes to names.
-    """
-    parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
-    keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
-    for src, dst in keys.items():
-        child = findChild(parent, src)
-        data = result[dst] = {}
-        for elt in child.childNodes:
-            if elt.attributes and elt.attributes.has_key('type'):
-                key, value = elt.attributes['type'].value, elt.childNodes[0].wholeText
-                # Don't over-write previously-read data for an alt form:
-                if elt.attributes.has_key('alt') and data.has_key(key):
-                    continue
-                data[key] = value
-
-    return result
-
-def findTagsInFile(file, path):
-    doc = parseDoc(file)
-
-    elt = doc.documentElement
-    tag_spec_list = path.split("/")
-    last_entry = None
-    for tag_spec in tag_spec_list:
-        tag_name = tag_spec
-        arg_name = 'type'
-        arg_value = ''
-        left_bracket = tag_spec.find('[')
-        if left_bracket != -1:
-            tag_name = tag_spec[:left_bracket]
-            arg_value = tag_spec[left_bracket+1:-1].split("=")
-            if len(arg_value) == 2:
-                arg_name = arg_value[0]
-                arg_value = arg_value[1]
-            else:
-                arg_value = arg_value[0]
-        elt = findChild(elt, tag_name, arg_name, arg_value)
-        if not elt:
-            return None
-    ret = []
-    if elt.childNodes:
-        for node in elt.childNodes:
-            if node.attributes:
-                element = [node.nodeName, None]
-                element[1] = node.attributes.items()
-                ret.append(element)
-    else:
-        if elt.attributes:
-            element = [elt.nodeName, None]
-            element[1] = elt.attributes.items()
-            ret.append(element)
-    return ret
-
-def _findEntryInFile(file, path, draft=None, attribute=None):
-    doc = parseDoc(file)
-
-    elt = doc.documentElement
-    tag_spec_list = path.split("/")
-    last_entry = None
-    for i in range(len(tag_spec_list)):
-        tag_spec = tag_spec_list[i]
-        tag_name = tag_spec
-        arg_name = 'type'
-        arg_value = ''
-        left_bracket = tag_spec.find('[')
-        if left_bracket != -1:
-            tag_name = tag_spec[:left_bracket]
-            arg_value = tag_spec[left_bracket+1:-1].split("=")
-            if len(arg_value) == 2:
-                arg_name = arg_value[0].replace("@", "").replace("'", "")
-                arg_value = arg_value[1]
-            else:
-                arg_value = arg_value[0]
-        alias = findChild(elt, 'alias')
-        if alias and alias.attributes['source'].nodeValue == 'locale':
-            path = alias.attributes['path'].nodeValue
-            aliaspath = tag_spec_list[:i] + path.split("/")
-            def resolve(x, y):
-                if y == '..':
-                    return x[:-1]
-                return x + [y]
-            # resolve all dot-dot parts of the path
-            aliaspath = reduce(resolve, aliaspath, [])
-            # remove attribute specification that our xpathlite doesnt support
-            aliaspath = map(lambda x: x.replace("@type=", "").replace("'", ""), aliaspath)
-            # append the remaining path
-            aliaspath = aliaspath + tag_spec_list[i:]
-            aliaspath = "/".join(aliaspath)
-            # "locale" aliases are special - we need to start lookup from scratch
-            return (None, aliaspath)
-        elt = findChild(elt, tag_name, arg_name, arg_value, draft)
-        if not elt:
-            return ("", None)
-    if attribute is not None:
-        if elt.attributes.has_key(attribute):
-            return (elt.attributes[attribute].nodeValue, None)
-        return (None, None)
-    try:
-        return (elt.firstChild.nodeValue, None)
-    except:
-        pass
-    return (None, None)
-
-def findAlias(file):
-    doc = parseDoc(file)
-
-    alias_elt = findChild(doc.documentElement, "alias")
-    if not alias_elt:
-        return False
-    if not alias_elt.attributes.has_key('source'):
-        return False
-    return alias_elt.attributes['source'].nodeValue
-
-lookup_chain_cache = {}
-parent_locales = {}
-def _fixedLookupChain(dirname, name):
-    if lookup_chain_cache.has_key(name):
-        return lookup_chain_cache[name]
-
-    # see http://www.unicode.org/reports/tr35/#Parent_Locales
-    if not parent_locales:
-        for ns in findTagsInFile(dirname + "/../supplemental/supplementalData.xml", "parentLocales"):
-            tmp = {}
-            parent_locale = ""
-            for data in ns[1:][0]: # ns looks like this: [u'parentLocale', [(u'parent', u'root'), (u'locales', u'az_Cyrl bs_Cyrl en_Dsrt ..')]]
-                tmp[data[0]] = data[1]
-                if data[0] == u"parent":
-                    parent_locale = data[1]
-            parent_locales[parent_locale] = tmp[u"locales"].split(" ")
-
-    items = name.split("_")
-    # split locale name into items and iterate through them from back to front
-    # example: az_Latn_AZ => [az_Latn_AZ, az_Latn, az]
-    items = list(reversed(map(lambda x: "_".join(items[:x+1]), range(len(items)))))
-
-    for i in range(len(items)):
-        item = items[i]
-        for parent_locale in parent_locales.keys():
-            for locale in parent_locales[parent_locale]:
-                if item == locale:
-                    if parent_locale == u"root":
-                        items = items[:i+1]
-                    else:
-                        items = items[:i+1] + _fixedLookupChain(dirname, parent_locale)
-                    lookup_chain_cache[name] = items
-                    return items
-
-    lookup_chain_cache[name] = items
-    return items
-
-def _findEntry(base, path, draft=None, attribute=None):
-    if base.endswith(".xml"):
-        base = base[:-4]
-    (dirname, filename) = os.path.split(base)
-
-    items = _fixedLookupChain(dirname, filename)
-    for item in items:
-        file = dirname + "/" + item + ".xml"
-        if os.path.isfile(file):
-            alias = findAlias(file)
-            if alias:
-                # if alias is found we should follow it and stop processing current file
-                # see http://www.unicode.org/reports/tr35/#Common_Elements
-                aliasfile = os.path.dirname(file) + "/" + alias + ".xml"
-                if not os.path.isfile(aliasfile):
-                    raise Error("findEntry: fatal error: found an alias '%s' to '%s', but the alias file couldn't be found" % (filename, alias))
-                # found an alias, recurse into parsing it
-                result = _findEntry(aliasfile, path, draft, attribute)
-                return result
-            (result, aliaspath) = _findEntryInFile(file, path, draft, attribute)
-            if aliaspath:
-                # start lookup again because of the alias source="locale"
-                return _findEntry(base, aliaspath, draft, attribute)
-            if result:
-                return result
-    return None
-
-def findEntry(base, path, draft=None, attribute=None):
-    file = base
-    if base.endswith(".xml"):
-        file = base
-        base = base[:-4]
-    else:
-        file = base + ".xml"
-    (dirname, filename) = os.path.split(base)
-
-    result = None
-    while path:
-        result = _findEntry(base, path, draft, attribute)
-        if result:
-            return result
-        (result, aliaspath) = _findEntryInFile(dirname + "/root.xml", path, draft, attribute)
-        if result:
-            return result
-        if not aliaspath:
-            raise Error("findEntry: fatal error: %s: cannot find key %s" % (filename, path))
-        path = aliaspath
-
-    return result
-
author	Qt Forward Merge Bot <qt_forward_merge_bot@qt-project.org>	2020-04-07 01:00:12 +0200
committer	Fabian Kosmale <fabian.kosmale@qt.io>	2020-04-08 22:04:23 +0200
commit	c937ed8af4f3dfef3fd8f8c2a9815376790dd5bf (patch)
tree	5175aff87e160ae8f32dadc60d3cfd38b73d4fb1 /util
parent	e0346df1b21cb30b54ae8d4918addc9925fa8479 (diff)
parent	8823bb8d306d78dd6a2e121a708dc607beff58c8 (diff)