5 files changed, 972 insertions, 898 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
index 7890adf307..94459b9e3f 100644
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@@ -27,7 +27,8 @@
 #############################################################################
 """Digesting the CLDR's data.
 
-Provides two class:
+Provides two classes:
+  CldrReader -- driver for reading CLDR data
   CldrAccess -- used by the reader to access the tree of data files
 
 The former should normally be all you need to access.
@@ -38,9 +39,206 @@ from xml.dom import minidom
 from weakref import WeakValueDictionary as CacheDict
 import os
 
-from localetools import Error
-from ldml import Node, Supplement
+from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from qlocalexml import Locale
 
+class CldrReader (object):
+    def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None):
+        """Set up a reader object for reading CLDR data.
+
+        First argument, root, is the file-system path to the root of
+        the unpacked CLDR archive; its common/ sub-directory should
+        contain dtd/, main/ and supplemental/ sub-directories.
+
+        Optional second argument, grumble, is a callable that logs
+        warnings and complaints, e.g. sys.stderr.write would be a
+        suitable callable.  The default is a no-op that ignores its
+        single argument.  Optional third argument, whitter, is
+        similar, used for less interesting output; pass
+        sys.stderr.write for it for verbose output."""
+        self.root = CldrAccess(root)
+        self.whitter, self.grumble = whitter, grumble
+
+    def likelySubTags(self):
+        """Generator for likely subtag information.
+
+        Yields pairs (have, give) of 4-tuples; if what you have
+        matches the left member, giving the right member is probably
+        sensible. Each 4-tuple's entries are the full names of a
+        language, a script, a country (strictly territory) and a
+        variant (currently ignored)."""
+        skips = []
+        for got, use in self.root.likelySubTags():
+            try:
+                have = self.__parseTags(got)
+                give = self.__parseTags(use)
+            except Error as e:
+                if ((use.startswith(got) or got.startswith('und_'))
+                    and e.message.startswith('Unknown ') and ' code ' in e.message):
+                    skips.append(use)
+                else:
+                    self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message))
+                continue
+            if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]):
+                continue
+
+            give = (give[0],
+                    # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+                    have[1] if give[1] == 'AnyScript' else give[1],
+                    have[2] if give[2] == 'AnyCountry' else give[2],
+                    give[3]) # AnyVariant similarly ?
+
+            yield have, give
+
+        if skips:
+            # TODO: look at LDML's reserved locale tag names; they
+            # show up a lot in this, and may be grounds for filtering
+            # more out.
+            pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
+
+    def readLocales(self, calendars = ('gregorian',)):
+        locales = tuple(self.__allLocales(calendars))
+        return dict(((k.language_id, k.script_id, k.country_id, k.variant_code),
+                     k) for k in locales)
+
+    def __allLocales(self, calendars):
+        def skip(locale, reason):
+            return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason)
+
+        for locale in self.root.defaultContentLocales:
+            try:
+                language, script, country, variant = self.__splitLocale(locale)
+            except ValueError:
+                self.whitter(skip(locale, 'only language tag'))
+                continue
+
+            if not (script or country):
+                self.grumble(skip(locale, 'second tag is neither script nor territory'))
+                continue
+
+            if not (language and country):
+                continue
+
+            try:
+                yield self.__getLocaleData(self.root.locale(locale), calendars,
+                                           language, script, country, variant)
+            except Error as e:
+                self.grumble(skip(locale, e.message))
+
+        for locale in self.root.fileLocales:
+            try:
+                chain = self.root.locale(locale)
+                language, script, country, variant = chain.tagCodes()
+                assert language
+                # TODO: this skip should probably be based on likely
+                # sub-tags, instead of empty country: if locale has a
+                # likely-subtag expansion, that's what QLocale uses,
+                # and we'll be saving its data for the expanded locale
+                # anyway, so don't need to record it for itself.
+                # See also QLocaleXmlReader.loadLocaleMap's grumble.
+                if not country:
+                    continue
+                yield self.__getLocaleData(chain, calendars, language, script, country, variant)
+            except Error as e:
+                self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message))
+
+    import textwrap
+    @staticmethod
+    def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap):
+        writer('\n'.join(wrap(prefix + ', '.join(tokens),
+                              subsequent_indent=' ', width=80)) + '\n')
+    del textwrap
+
+    def __parseTags(self, locale):
+        tags = self.__splitLocale(locale)
+        language = tags.next()
+        script = country = variant = ''
+        try:
+            script, country, variant = tags
+        except ValueError:
+            pass
+        return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant))
+
+    def __splitLocale(self, name):
+        """Generate (language, script, territory, variant) from a locale name
+
+        Ignores any trailing fields (with a warning), leaves script (a
+        capitalised four-letter token), territory (either a number or
+        an all-uppercase token) or variant (upper case and digits)
+        empty if unspecified.  Only generates one entry if name is a
+        single tag (i.e. contains no underscores).  Always yields 1 or
+        4 values, never 2 or 3."""
+        tags = iter(name.split('_'))
+        yield tags.next() # Language
+        tag = tags.next() # StopIteration here ends us after one value
+
+        # Script is always four letters, always capitalised:
+        if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+            yield tag
+            try:
+                tag = tags.next()
+            except StopIteration:
+                tag = ''
+        else:
+            yield ''
+
+        # Territory is upper-case or numeric:
+        if tag and (tag.isupper() or tag.isdigit()):
+            yield tag
+            try:
+                tag = tags.next()
+            except StopIteration:
+                tag = ''
+        else:
+            yield ''
+
+        # Variant can be any mixture of upper-case and digits.
+        if tag and all(c.isupper() or c.isdigit() for c in tag):
+            yield tag
+            tag = ''
+        else:
+            yield ''
+
+        # If nothing is left, StopIteration will avoid the warning:
+        if not tag:
+            tag = tags.next()
+        self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join((tag,) + tuple(tags)), name))
+
+    def __getLocaleData(self, scan, calendars, language, script, country, variant):
+        ids, names = zip(*self.root.codesToIdName(language, script, country, variant))
+        assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant)
+        locale = Locale(
+            language = names[0], language_code = language, language_id = ids[0],
+            script = names[1], script_code = script, script_id = ids[1],
+            country = names[2], country_code = country, country_id = ids[2],
+            variant_code = variant)
+
+        firstDay, weStart, weEnd = self.root.weekData(country)
+        assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+                   for day in (firstDay, weStart, weEnd))
+
+        locale.update(firstDayOfWeek = firstDay,
+                      weekendStart = weStart,
+                      weekendEnd = weEnd)
+
+        iso, digits, rounding = self.root.currencyData(country)
+        locale.update(currencyIsoCode = iso,
+                      currencyDigits = int(digits),
+                      currencyRounding = int(rounding))
+
+        locale.update(scan.currencyData(iso))
+        locale.update(scan.numericData(self.root.numberSystem, self.whitter))
+        locale.update(scan.textPatternData())
+        locale.update(scan.endonyms(language, script, country, variant))
+        locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ...
+        locale.update(scan.calendarNames(calendars)) # Names of days and months
+
+        return locale
+
+# Note: various caches assume this class is a singleton, so the
+# "default" value for a parameter no caller should pass can serve as
+# the cache. If a process were to instantiate this class with distinct
+# roots, each cache would be filled by the first to need it !
 class CldrAccess (object):
     def __init__(self, root):
         """Set up a master object for accessing CLDR data.
@@ -50,6 +248,12 @@ class CldrAccess (object):
         contain dtd/, main/ and supplemental/ sub-directories."""
         self.root = root
 
+    def xml(self, *path):
+        """Load a single XML file and return its root element as an XmlScanner.
+
+        The path is interpreted relative to self.root"""
+        return XmlScanner(Node(self.__xml(path)))
+
     def supplement(self, name):
         """Loads supplemental data as a Supplement object.
 
@@ -57,6 +261,117 @@ class CldrAccess (object):
         """
         return Supplement(Node(self.__xml(('common', 'supplemental', name))))
 
+    def locale(self, name):
+        """Loads all data for a locale as a LocaleScanner object.
+
+        The name should be a locale name; adding suffix '.xml' to it
+        should usually yield a file in common/main/.  The returned
+        LocaleScanner object packages this file along with all those
+        from which it inherits; its methods know how to handle that
+        inheritance, where relevant."""
+        return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
+
+    @property
+    def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir,
+                    splitExtension = os.path.splitext):
+        """Generator for locale IDs seen in file-names.
+
+        All *.xml other than root.xml in common/main/ are assumed to
+        identify locales."""
+        for name in listDirectory(joinPath(self.root, 'common', 'main')):
+            stem, ext = splitExtension(name)
+            if ext == '.xml' and stem != 'root':
+                yield stem
+
+    @property
+    def defaultContentLocales(self):
+        """Generator for the default content locales."""
+        for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'):
+            try:
+                locales = attrs['locales']
+            except KeyError:
+                pass
+            else:
+                for locale in locales.split():
+                    yield locale
+
+    def likelySubTags(self):
+        for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'):
+            yield attrs['from'], attrs['to']
+
+    def numberSystem(self, system):
+        """Get a description of a numbering system.
+
+        Returns a mapping, with keys u'digits', u'type' and u'id'; the
+        value for this last is system. Raises ldml.Error if system is
+        unknown or unsupported, or on failure to load data."""
+        try:
+            return self.__numberSystems[system]
+        except KeyError:
+            raise Error('Unsupported number system: {}'.format(system))
+
+    def weekData(self, country):
+        """Data on the weekly cycle.
+
+        Returns a triple (W, S, E) of en's short names for week-days;
+        W is the first day of the week, S the start of the week-end
+        and E the end of the week-end.  Where data for a country is
+        unavailable, the data for CLDR's territory 001 (The World) is
+        used."""
+        try:
+            return self.__weekData[country]
+        except KeyError:
+            return self.__weekData['001']
+
+    def currencyData(self, country):
+        """Returns currency data for the given country code.
+
+        Return value is a tuple (ISO4217 code, digit count, rounding
+        mode).  If CLDR provides no data for this country, ('', 2, 1)
+        is the default result.
+        """
+        try:
+            return self.__currencyData[country]
+        except KeyError:
+            return '', 2, 1
+
+    def codesToIdName(self, language, script, country, variant = ''):
+        """Maps each code to the appropriate ID and name.
+
+        Returns a 4-tuple of (ID, name) pairs corresponding to the
+        language, script, country and variant given.  Raises a
+        suitable error if any of them is unknown, indicating all that
+        are unknown plus suitable names for any that could sensibly be
+        added to enumdata.py to make them known.
+
+        Until we implement variant support (QTBUG-81051), the fourth
+        member of the returned tuple is always 0 paired with a string
+        that should not be used."""
+        enum = self.__enumMap
+        try:
+            return (enum('language')[language],
+                    enum('script')[script],
+                    enum('country')[country],
+                    enum('variant')[variant])
+        except KeyError:
+            pass
+
+        parts, values = [], [language, script, country, variant]
+        for index, key in enumerate(('language', 'script', 'country', 'variant')):
+            naming, enums = self.__codeMap(key), enum(key)
+            value = values[index]
+            if value not in enums:
+                text = '{} code {}'.format(key, value)
+                name = naming.get(value)
+                if name and value != 'POSIX':
+                    text += u' (could add {})'.format(name)
+                parts.append(text)
+        if len(parts) > 1:
+            parts[-1] = 'and ' + parts[-1]
+        assert parts
+        raise Error('Unknown ' + ', '.join(parts),
+                    language, script, country, variant)
+
     def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
         """Digest CLDR's MS-Win time-zone name mapping.
 
@@ -139,11 +454,97 @@ class CldrAccess (object):
         return open(joinPath(self.root, *path))
 
     @property
+    def __rootLocale(self, cache = []):
+        if not cache:
+            cache.append(self.xml('common', 'main', 'root.xml'))
+        return cache[0]
+
+    @property
     def __supplementalData(self, cache = []):
         if not cache:
             cache.append(self.supplement('supplementalData.xml'))
         return cache[0]
 
+    @property
+    def __numberSystems(self, cache = {}, joinPath=os.path.join):
+        if not cache:
+            for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
+                if ord(attrs.get('digits', u'\x10000')[0]) > 0xffff:
+                    # FIXME, QTBUG-69324: make this redundant; omit if zero is beyond U+FFFF.
+                    # NOTE(review): u'\x10000' is U+0010 then '000', not U+10000 - confirm.
+                    continue
+
+                cache[attrs['id']] = attrs
+            assert cache
+        return cache
+
+    @property
+    def __weekData(self, cache = {}):
+        if not cache:
+            firstDay, weStart, weEnd = self.__getWeekData()
+            # Massage those into an easily-consulted form:
+            # World defaults given for code '001':
+            mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001']
+            lands = set(firstDay) | set(weStart) | set(weEnd)
+            cache.update((land,
+                          (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun)))
+                         for land in lands)
+            assert cache
+        return cache
+
+    def __getWeekData(self):
+        """Scan for data on the weekly cycle.
+
+        Yields three mappings from locales to en's short names for
+        week-days; if a locale isn't a key of a given mapping, it
+        should use the '001' (world) locale's value. The first mapping
+        gives the day on which the week starts, the second gives the
+        day on which the week-end starts, the third gives the last day
+        of the week-end."""
+        source = self.__supplementalData
+        for key in ('firstDay', 'weekendStart', 'weekendEnd'):
+            result = {}
+            for ignore, attrs in source.find('weekData/' + key):
+                assert ignore == key
+                day = attrs['day']
+                assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
+                if 'alt' in attrs:
+                    continue
+                for loc in attrs.get('territories', '').split():
+                    result[loc] = day
+            yield result
+
+    @property
+    def __currencyData(self, cache = {}):
+        if not cache:
+            source = self.__supplementalData
+            for elt in source.findNodes('currencyData/region'):
+                iso, digits, rounding = '', 2, 1
+                try:
+                    country = elt.dom.attributes['iso3166'].nodeValue
+                except KeyError:
+                    continue
+                for child in elt.findAllChildren('currency'):
+                    try:
+                        if child.dom.attributes['tender'].nodeValue == 'false':
+                            continue
+                    except KeyError:
+                        pass
+                    try:
+                        child.dom.attributes['to'] # Is set if this element has gone out of date.
+                    except KeyError:
+                        iso = child.dom.attributes['iso4217'].nodeValue
+                        break
+                if iso:
+                    for tag, data in source.find(
+                        'currencyData/fractions/info[iso4217={}]'.format(iso)):
+                        digits = data['digits']
+                        rounding = data['rounding']
+                cache[country] = iso, digits, rounding
+            assert cache
+
+        return cache
+
     def __scanLdmlDtd(self, joinPath = os.path.join):
         """Scan the LDML DTD, record CLDR version."""
         with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
@@ -151,7 +552,8 @@ class CldrAccess (object):
                 if line.startswith('<!ATTLIST '):
                     parts = line.split()
                     if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
-                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+                        # parts[5] is the version, in quotes, maybe
+                        # with a final > attached to its end:
                         self.__cldrVersion = parts[5].split('"')[1]
                         break
 
@@ -178,5 +580,93 @@ class CldrAccess (object):
 
         return cache[key]
 
+    def __codeMap(self, key, cache = {},
+                  # Maps our name for it to CLDR's name:
+                  naming = {'language': 'languages', 'script': 'scripts',
+                            'country': 'territories', 'variant': 'variants'}):
+        if not cache:
+            root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames')
+            for dst, src in naming.items():
+                cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
+            assert cache
+
+        return cache[key]
+
+    def __codeMapScan(self, node):
+        """Get mapping from codes to element values.
+
+        Passed in node is a <languages>, <scripts>, <territories> or
+        <variants> node, each child of which is a <language>,
+        <script>, <territory> or <variant> node as appropriate, whose
+        type is a code (of the appropriate flavour) and content is its
+        full name.  In some cases, two child nodes have the same type;
+        in these cases, one always has an alt attribute and we should
+        prefer the other.  Yields all such type, content pairs found
+        in node's children (skipping any with an alt attribute, if
+        their type has been seen previously)."""
+        seen = set()
+        for elt in node.dom.childNodes:
+            try:
+                key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText
+            except (KeyError, ValueError, TypeError):
+                pass
+            else:
+                if key not in seen or not elt.attributes.has_key('alt'):
+                    yield key, value
+                    seen.add(key)
+
+    # CLDR uses inheritance between locales to save repetition:
+    def __parentLocale(self, name, cache = {}):
+        # see http://www.unicode.org/reports/tr35/#Parent_Locales
+        if not cache:
+            for tag, attrs in self.__supplementalData.find('parentLocales'):
+                parent = attrs.get('parent', '')
+                for child in attrs['locales'].split():
+                    cache[child] = parent
+            assert cache
+
+        return cache[name]
+
+    def __localeAsDoc(self, name, aliasFor = None,
+                      joinPath = os.path.join, exists = os.path.isfile):
+        path = ('common', 'main', name + '.xml')
+        if exists(joinPath(self.root, *path)):
+            elt = self.__xml(path)
+            for child in Node(elt).findAllChildren('alias'):
+                try:
+                    alias = child.dom.attributes['source'].nodeValue
+                except (KeyError, AttributeError):
+                    pass
+                else:
+                    return self.__localeAsDoc(alias, aliasFor or name)
+            # No alias child with a source:
+            return elt
+
+        if aliasFor:
+            raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias'
+                        .format(aliasFor, name))
+
+    def __scanLocaleRoots(self, name):
+        while name and name != 'root':
+            doc = self.__localeAsDoc(name)
+            if doc is not None:
+                yield Node(doc)
+
+            try:
+                name = self.__parentLocale(name)
+            except KeyError:
+                try:
+                    name, tail = name.rsplit('_', 1)
+                except ValueError: # No tail to discard: we're done
+                    break
+
+    class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
+    def __localeRoots(self, name, cache = CacheDict()):
+        try:
+            chain = cache[name]
+        except KeyError:
+            cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name))
+        return chain
+
 # Unpolute the namespace: we don't need to export these.
 del minidom, CacheDict, os
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index 41795ff634..b28dcecc45 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -2,7 +2,7 @@
 # coding=utf8
 #############################################################################
 ##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@@ -31,15 +31,17 @@
 
 The CLDR data can be downloaded from CLDR_, which has a sub-directory
 for each version; you need the ``core.zip`` file for your version of
-choice (typically the latest).  This script has had updates to cope up
-to v35; for later versions, we may need adaptations.  Unpack the
+choice (typically the latest). This script has had updates to cope up
+to v35; for later versions, we may need adaptations. Unpack the
 downloaded ``core.zip`` and check it has a common/main/ sub-directory:
-pass the path of that sub-directory to this script as its single
-command-line argument.  Save its standard output (but not error) to a
-file for later processing by ``./qlocalexml2cpp.py``
+pass the path of the unpacked archive's root to this script as its first
+command-line argument. Pass the name of the file in which to write
+output as the second argument; either omit it or use '-' to select the
+standard output. This file is the input needed by
+``./qlocalexml2cpp.py``
 
 When you update the CLDR data, be sure to also update
-src/corelib/text/qt_attribution.json's entry for unicode-cldr.  Check
+src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
 this script's output for unknown language, country or script messages;
 if any can be resolved, use their entry in common/main/en.xml to
 append new entries to enumdata.py's lists and update documentation in
@@ -53,610 +55,62 @@ time zone names; see cldr2qtimezone.py for details.
 """
 
 import os
-import sys
-import re
-import textwrap
 
-import enumdata
 from localetools import Error
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, codeMapsFromFile, \
-    _findEntryInFile as findEntryInFile
-from dateconverter import convert_date
-from qlocalexml import Locale, QLocaleXmlWriter
-
-# TODO: make calendars a command-line option
-calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
-def wrappedwarn(err, prefix, tokens):
-    return err.write(
-        '\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
-                                subsequent_indent=' ', width=80)) + '\n')
-
-def parse_number_format(patterns, data):
-    # this is a very limited parsing of the number format for currency only.
-    def skip_repeating_pattern(x):
-        p = x.replace('0', '#').replace(',', '').replace('.', '')
-        seen = False
-        result = ''
-        for c in p:
-            if c == '#':
-                if seen:
-                    continue
-                seen = True
-            else:
-                seen = False
-            result = result + c
-        return result
-    patterns = patterns.split(';')
-    result = []
-    for pattern in patterns:
-        pattern = skip_repeating_pattern(pattern)
-        pattern = pattern.replace('#', "%1")
-        # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
-        # there can be doubled or trippled currency sign, however none of the
-        # locales use that.
-        pattern = pattern.replace(u'\xa4', "%2")
-        pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'")
-        pattern = pattern.replace('-', data['minus'])
-        pattern = pattern.replace('+', data['plus'])
-        result.append(pattern)
-    return result
-
-cldr_dir = None
-def raiseUnknownCode(code, form, cache={}):
-    """Check whether an unknown code could be supported.
-
-    We declare a language, script or country code unknown if it's not
-    known to enumdata.py; however, if it's present in main/en.xml's
-    mapping of codes to names, we have the option of adding support.
-    This caches the necessary look-up (so we only read main/en.xml
-    once) and returns the name we should use if we do add support.
-
-    First parameter, code, is the unknown code.  Second parameter,
-    form, is one of 'language', 'script' or 'country' to select the
-    type of code to look up.  Do not pass further parameters (the next
-    will deprive you of the cache).
-
-    Raises localetools.Error with a suitable message, that includes
-    the unknown code's full name if found.
-
-    Relies on global cldr_dir being set before it's called; see tail
-    of this file.
-    """
-    if not cache:
-        cache.update(codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
-    name = cache[form].get(code)
-    msg = 'unknown %s code "%s"' % (form, code)
-    if name:
-        msg += ' - could use "%s"' % name
-    raise Error(msg)
-
-def parse_list_pattern_part_format(pattern):
-    # This is a very limited parsing of the format for list pattern part only.
-    return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
-
-def unit_quantifiers(find, path, stem, suffix, known,
-                     # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
-                     # 1000^7 < zebi = 2^{70}, the next quantifiers up:
-                     si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
-    """Work out the unit quantifiers.
-
-    Unfortunately, the CLDR data only go up to terabytes and we want
-    all the way to exabytes; but we can recognize the SI quantifiers
-    as prefixes, strip and identify the tail as the localized
-    translation for 'B' (e.g. French has 'octet' for 'byte' and uses
-    ko, Mo, Go, To from which we can extrapolate Po, Eo).
-
-    Should be called first for the SI quantifiers, with suffix = 'B',
-    then for the IEC ones, with suffix = 'iB'; the list known
-    (initially empty before first call) is used to let the second call
-    know what the first learned about the localized unit.
-    """
-    if suffix == 'B': # first call, known = []
-        tail = suffix
-        for q in si_quantifiers:
-            it = find(path, stem % q)
-            # kB for kilobyte, in contrast with KiB for IEC:
-            q = q[0] if q == 'kilo' else q[0].upper()
-            if not it:
-                it = q + tail
-            elif it.startswith(q):
-                rest = it[1:]
-                tail = rest if all(rest == k for k in known) else suffix
-                known.append(rest)
-            yield it
-    else: # second call, re-using first's known
-        assert suffix == 'iB'
-        if known:
-            byte = known.pop()
-            if all(byte == k for k in known):
-                suffix = 'i' + byte
-        for q in si_quantifiers:
-            yield find(path, stem % q[:2],
-                       # Those don't (yet, v31) exist in CLDR, so we always fall back to:
-                       q[0].upper() + suffix)
-
-def generateLocaleInfo(path):
-    if not path.endswith(".xml"):
-        return {}
-
-    # skip legacy/compatibility ones
-    alias = findAlias(path)
-    if alias:
-        raise Error('Alias to "{}"'.format(alias))
-
-    def code(tag):
-        return findEntryInFile(path, 'identity/' + tag, attribute="type")[0]
-
-    return _generateLocaleInfo(path, code('language'), code('script'),
-                               code('territory'), code('variant'))
-
-def getNumberSystems(cache={}):
-    """Cached look-up of number system information.
-
-    Pass no arguments.  Returns a mapping from number system names to,
-    for each system, a mapping with keys 'digits', 'type' and 'id'.
-    Relies on global cldr_dir being set before it's first called.\n"""
-    if not cache:
-        for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
-                                              'numberingSystems.xml'),
-                                 'numberingSystems'):
-            # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]]
-            entry = dict(ns[1])
-            name = entry[u'id']
-            if u'digits' in entry and ord(entry[u'digits'][0]) > 0xffff:
-                # FIXME, QTBUG-69324: make this redundant:
-                # omit number system if zero doesn't fit in single-char16 UTF-16 :-(
-                sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X]\n'
-                                 % (name, ord(entry[u'digits'][0])))
-            else:
-                cache[name] = entry
-    return cache
-
-def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
-    if not path.endswith(".xml"):
-        return {}
-
-    if language_code == 'root':
-        # just skip it
-        return {}
-
-    # we do not support variants
-    # ### actually there is only one locale with variant: en_US_POSIX
-    #     does anybody care about it at all?
-    if variant_code:
-        raise Error('We do not support variants ("{}")'.format(variant_code))
-
-    language_id = enumdata.languageCodeToId(language_code)
-    if language_id <= 0:
-        raiseUnknownCode(language_code, 'language')
-
-    script_id = enumdata.scriptCodeToId(script_code)
-    if script_id == -1:
-        raiseUnknownCode(script_code, 'script')
-
-    # we should handle fully qualified names with the territory
-    if not country_code:
-        return {}
-    country_id = enumdata.countryCodeToId(country_code)
-    if country_id <= 0:
-        raiseUnknownCode(country_code, 'country')
-
-    # So we say we accept only those values that have "contributed" or
-    # "approved" resolution. see http://www.unicode.org/cldr/process.html
-    # But we only respect the resolution for new datas for backward
-    # compatibility.
-    draft = DraftResolution.contributed
-
-    result = dict(
-        language=enumdata.language_list[language_id][0],
-        language_code=language_code, language_id=language_id,
-        script=enumdata.script_list[script_id][0],
-        script_code=script_code, script_id=script_id,
-        country=enumdata.country_list[country_id][0],
-        country_code=country_code, country_id=country_id,
-        variant_code=variant_code)
-
-    (dir_name, file_name) = os.path.split(path)
-    def from_supplement(tag,
-                        path=os.path.join(dir_name, '..', 'supplemental',
-                                          'supplementalData.xml')):
-        return findTagsInFile(path, tag)
-    currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
-    result['currencyIsoCode'] = ''
-    result['currencyDigits'] = 2
-    result['currencyRounding'] = 1
-    if currencies:
-        for e in currencies:
-            if e[0] == 'currency':
-                t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
-                if t and t[0]:
-                    pass
-                elif not any(x[0] == 'to' for x in e[1]):
-                    result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next()
-                    break
-        if result['currencyIsoCode']:
-            t = from_supplement("currencyData/fractions/info[iso4217=%s]"
-                                % result['currencyIsoCode'])
-            if t and t[0][0] == 'info':
-                result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next()
-                result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next()
-    numbering_system = None
-    try:
-        numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
-    except Error:
-        pass
-    def findEntryDef(path, xpath, value=''):
-        try:
-            return findEntry(path, xpath)
-        except Error:
-            return value
-    def get_number_in_system(path, xpath, numbering_system):
-        if numbering_system:
-            try:
-                return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
-            except Error:
-                # in CLDR 1.9 number system was refactored for numbers (but not for currency)
-                # so if previous findEntry doesn't work we should try this:
-                try:
-                    return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
-                except Error:
-                    # fallback to default
-                    pass
-        return findEntry(path, xpath)
-
-    result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
-    result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
-    assert result['decimal'] != result['group']
-    result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
-    result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
-    try:
-        result['zero'] = getNumberSystems()[numbering_system][u"digits"][0]
-    except Exception as e:
-        sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
-        result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
-    result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
-    result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
-    result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system).lower()
-    result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
-    result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
-    result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
-    result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
-    result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
-    result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
-    result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
-    result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
-    result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
-    result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
-    result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
-    result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
-    result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
-    result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))
-
-    endonym = None
-    if country_code and script_code:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
-    if not endonym and script_code:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
-    if not endonym and country_code:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
-    if not endonym:
-        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
-    result['languageEndonym'] = endonym
-    result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))
-
-    currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
-    currency_format = parse_number_format(currency_format, result)
-    result['currencyFormat'] = currency_format[0]
-    result['currencyNegativeFormat'] = ''
-    if len(currency_format) > 1:
-        result['currencyNegativeFormat'] = currency_format[1]
-
-    result['currencySymbol'] = ''
-    result['currencyDisplayName'] = ''
-    if result['currencyIsoCode']:
-        result['currencySymbol'] = findEntryDef(path, "numbers/currencies/currency[%s]/symbol" % result['currencyIsoCode'])
-        result['currencyDisplayName'] = ';'.join(
-            findEntryDef(path, 'numbers/currencies/currency[' + result['currencyIsoCode']
-                         + ']/displayName' + tail)
-            for tail in ['',] + [
-                '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
-                ]) + ';'
-
-    def findUnitDef(path, stem, fallback=''):
-        # The displayName for a quantified unit in en.xml is kByte
-        # instead of kB (etc.), so prefer any unitPattern provided:
-        for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
-            try:
-                ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
-            except Error:
-                continue
-
-            # TODO: epxloit count-handling, instead of discarding placeholders
-            if ans.startswith('{0}'):
-                ans = ans[3:].lstrip()
-            if ans:
-                return ans
-
-        return findEntryDef(path, stem + 'displayName', fallback)
-
-    # First without quantifier, then quantified each way:
-    result['byte_unit'] = findEntryDef(
-        path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
-        'bytes')
-    stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
-    known = [] # cases where we *do* have a given version:
-    result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
-    # IEC 60027-2
-    # http://physics.nist.gov/cuu/Units/binary.html
-    result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
-
-    # Used for month and day data:
-    namings = (
-        ('standaloneLong', 'stand-alone', 'wide'),
-        ('standaloneShort', 'stand-alone', 'abbreviated'),
-        ('standaloneNarrow', 'stand-alone', 'narrow'),
-        ('long', 'format', 'wide'),
-        ('short', 'format', 'abbreviated'),
-        ('narrow', 'format', 'narrow'),
-        )
-
-    # Month names for 12-month calendars:
-    for cal in calendars:
-        stem = 'dates/calendars/calendar[' + cal + ']/months/'
-        for (key, mode, size) in namings:
-            prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
-            result[key + 'Months_' + cal] = ';'.join(
-                findEntry(path, stem + prop + "month[%d]" % i)
-                for i in range(1, 13)) + ';'
-
-    # Day data (for Gregorian, at least):
-    stem = 'dates/calendars/calendar[gregorian]/days/'
-    days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
-    for (key, mode, size) in namings:
-        prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
-        result[key + 'Days'] = ';'.join(
-            findEntry(path, stem + prop + '[' + day + ']')
-            for day in days) + ';'
-
-    return Locale(result)
-
-def integrateWeekData(filePath, locale_database):
-    if not filePath.endswith(".xml"):
-        return {}
-
-    def lookup(key):
-        return findEntryInFile(filePath, key, attribute='territories')[0].split()
-    days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
-
-    firstDayByCountryCode = {}
-    for day in days:
-        for countryCode in lookup('weekData/firstDay[day=%s]' % day):
-            firstDayByCountryCode[countryCode] = day
-
-    weekendStartByCountryCode = {}
-    for day in days:
-        for countryCode in lookup('weekData/weekendStart[day=%s]' % day):
-            weekendStartByCountryCode[countryCode] = day
-
-    weekendEndByCountryCode = {}
-    for day in days:
-        for countryCode in lookup('weekData/weekendEnd[day=%s]' % day):
-            weekendEndByCountryCode[countryCode] = day
-
-    for (key, locale) in locale_database.iteritems():
-        countryCode = locale.country_code
-        if countryCode in firstDayByCountryCode:
-            locale.firstDayOfWeek = firstDayByCountryCode[countryCode]
-        else:
-            locale.firstDayOfWeek = firstDayByCountryCode["001"]
-
-        if countryCode in weekendStartByCountryCode:
-            locale.weekendStart = weekendStartByCountryCode[countryCode]
-        else:
-            locale.weekendStart = weekendStartByCountryCode["001"]
-
-        if countryCode in weekendEndByCountryCode:
-            locale.weekendEnd = weekendEndByCountryCode[countryCode]
-        else:
-            locale.weekendEnd = weekendEndByCountryCode["001"]
-
-def splitLocale(name):
-    """Split name into (language, script, territory) triple as generator.
-
-    Ignores any trailing fields (with a warning), leaves script (a capitalised
-    four-letter token) or territory (either a number or an all-uppercase token)
-    empty if unspecified, returns a single-entry generator if name is a single
-    tag (i.e. contains no underscores).  Always yields 1 or 3 values, never 2."""
-    tags = iter(name.split('_'))
-    yield tags.next() # Language
-    tag = tags.next()
-
-    # Script is always four letters, always capitalised:
-    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
-        yield tag
-        try:
-            tag = tags.next()
-        except StopIteration:
-            tag = ''
-    else:
-        yield ''
-
-    # Territory is upper-case or numeric:
-    if tag and tag.isupper() or tag.isdigit():
-        yield tag
-        tag = ''
-    else:
-        yield ''
-
-    # If nothing is left, StopIteration will avoid the warning:
-    tag = (tag if tag else tags.next(),)
-    sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
-
-def _parseLocale(l):
-    language = "AnyLanguage"
-    script = "AnyScript"
-    country = "AnyCountry"
-
-    if l == "und":
-        raise Error('We treat unknown locale like C')
-
-    parsed = splitLocale(l)
-    language_code = parsed.next()
-    script_code = country_code = ''
-    try:
-        script_code, country_code = parsed
-    except ValueError:
-        pass
-
-    if language_code != "und":
-        language_id = enumdata.languageCodeToId(language_code)
-        if language_id == -1:
-            raise Error('Unknown language code "{}"'.format(language_code))
-        language = enumdata.language_list[language_id][0]
-
-    if script_code:
-        script_id = enumdata.scriptCodeToId(script_code)
-        if script_id == -1:
-            raise Error('Unknown script code "{}"'.format(script_code))
-        script = enumdata.script_list[script_id][0]
-
-    if country_code:
-        country_id = enumdata.countryCodeToId(country_code)
-        if country_id == -1:
-            raise Error('Unknown country code "{}"'.format(country_code))
-        country = enumdata.country_list[country_id][0]
-
-    return (language, script, country)
-
-def likelySubtags(root, err):
-    skips = []
-    for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"):
-        tmp = {}
-        for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
-            tmp[data[0]] = data[1]
-
-        try:
-            from_language, from_script, from_country = _parseLocale(tmp[u"from"])
-            to_language, to_script, to_country = _parseLocale(tmp[u"to"])
-        except Error as e:
-            if (tmp['to'].startswith(tmp['from'])
-                and e.message == 'Unknown language code "{}"'.format(tmp['from'])):
-                skips.append(tmp['to'])
-            else:
-                sys.stderr.write('skipping likelySubtag "{}" -> "{}" ({})\n'.format(
-                        tmp[u"from"], tmp[u"to"], e.message))
-            continue
-        # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
-        if to_country == "AnyCountry" and from_country != to_country:
-            to_country = from_country
-        if to_script == "AnyScript" and from_script != to_script:
-            to_script = from_script
-
-        yield ((from_language, from_script, from_country),
-               (to_language, to_script, to_country))
-    if skips:
-        wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips)
+from cldr import CldrReader
+from qlocalexml import QLocaleXmlWriter
+from enumdata import language_list, script_list, country_list
 
 def usage(err, name, message = ''):
-    err.write("""Usage: {} <path-to-cldr-main> [out-file.xml]
-""".format(name)) # TODO: expand
+    err.write("""Usage: {} path/to/cldr/root [out-file.xml]
+""".format(name)) # TODO: expand command-line, improve help message
     if message:
         err.write('\n' + message + '\n')
 
 def main(args, out, err):
-    name = args.pop(0)
+    # TODO: make calendars a command-line option
+    calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
 
-    if len(args) < 1:
-        usage(err, name)
+    # TODO: make argument parsing more sophisticated
+    name = args.pop(0)
+    if not args:
+        usage(err, name, 'Where is your CLDR data tree ?')
         return 1
 
-    global cldr_dir
-    cldr_dir = args.pop(0)
-    if not os.path.isdir(cldr_dir):
-        usage(err, name, 'Where did you unpack the CLDR data files ?')
+    root = args.pop(0)
+    if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')):
+        usage(err, name,
+              'First argument is the root of the CLDR tree: found no common/main/root.xml under '
+              + root)
         return 1
 
-    if len(args) > 1:
-        usage(err, name, 'Too many arguments passed')
+    xml = args.pop(0) if args else None
+    if not xml or xml == '-':
+        emit = out
+    elif not xml.endswith('.xml'):
+        usage(err, name, 'Please use a .xml extension on your output file name, not ' + xml)
         return 1
-    if args:
-        qxml = open(args.pop(0), 'w')
     else:
-        qxml = out
-
-    getNumberSystems(cldr_dir)
-    cldr_files = os.listdir(cldr_dir)
-    locale_database = {}
-
-    # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-    defaultContent_locales = []
-    for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
-                                          'supplementalMetadata.xml'),
-                             'metadata/defaultContent'):
-        for data in ns[1:][0]:
-            if data[0] == u"locales":
-                defaultContent_locales += data[1].split()
-
-    skips = []
-    for file in defaultContent_locales:
         try:
-            language_code, script_code, country_code = splitLocale(file)
-        except ValueError:
-            sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
-            continue
+            emit = open(xml, 'w')
+        except IOError as e:
+            usage(err, name, 'Failed to open "{}" to write output to it\n'.format(xml))
+            return 1
 
-        if not (script_code or country_code):
-            sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
-            continue
-
-        try:
-            l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
-            if not l:
-                skips.append(file)
-                continue
-        except Error as e:
-            sys.stderr.write('skipping defaultContent locale "{}" ({})\n'.format(file, e.message))
-            continue
-
-        locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-    if skips:
-        wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ', skips)
-        skips = []
-
-    for file in cldr_files:
-        try:
-            l = generateLocaleInfo(cldr_dir + "/" + file)
-            if not l:
-                skips.append(file)
-                continue
-        except Error as e:
-            sys.stderr.write('skipping file "{}" ({})\n'.format(file, e.message))
-            continue
-
-        locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-    if skips:
-        wrappedwarn(err, 'skipping files [no locale info generated]: ', skips)
+    if args:
+        usage(err, name, 'Too many arguments - excess: ' + ' '.join(args))
+        return 1
 
-    integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml", locale_database)
-    cldr_version = 'unknown'
-    with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml:
-        for line in ldml:
-            if 'version cldrVersion CDATA #FIXED' in line:
-                cldr_version = line.split('"')[1]
+    # TODO - command line options to tune choice of grumble and whitter:
+    reader = CldrReader(root, err.write, err.write)
+    writer = QLocaleXmlWriter(emit.write)
 
-    xmlOut = QLocaleXmlWriter(qxml.write)
-    xmlOut.version(cldr_version)
-    xmlOut.enumData(enumdata.language_list,
-                    enumdata.script_list,
-                    enumdata.country_list)
-    xmlOut.likelySubTags(likelySubtags(os.path.split(cldr_dir)[0], err))
-    xmlOut.locales(locale_database, calendars)
-    xmlOut.close()
-    if qxml is not out:
-        qxml.close()
+    writer.version(reader.root.cldrVersion)
+    writer.enumData(language_list, script_list, country_list)
+    writer.likelySubTags(reader.likelySubTags())
+    writer.locales(reader.readLocales(calendars), calendars)
 
+    writer.close()
     return 0
 
 if __name__ == '__main__':
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index 4aaa728a86..ff94f3da73 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -39,10 +39,12 @@ returned by minidom.parse() and their child-nodes:
   Node -- wraps any node in the DOM tree
   XmlScanner -- wraps the root element of a stand-alone XML file
   Supplement -- specializes XmlScanner for supplemental data files
+  LocaleScanner -- wraps a locale's inheritance-chain of file roots
 
 See individual classes for further detail.
 """
 from localetools import Error
+from dateconverter import convert_date
 
 class Node (object):
     """Wrapper for an arbitrary DOM node.
@@ -51,11 +53,20 @@ class Node (object):
     nodes are returned wrapped as Node objects.  A Node exposes the
     raw DOM node it wraps via its .dom attribute."""
 
-    def __init__(self, elt):
+    def __init__(self, elt, draft = 0):
         """Wraps a DOM node for ease of access.
 
-        Single argument, elt, is the DOM node to wrap."""
+        First argument, elt, is the DOM node to wrap. (Optional second
+        argument, draft, should only be supplied by this class's
+        creation of child nodes; it is the maximum draft score of any
+        ancestor of the new node.)"""
         self.dom = elt
+        try:
+            attr = elt.attributes['draft'].nodeValue
+        except KeyError:
+            self.draft = draft
+        else:
+            self.draft = max(draft, self.draftScore(attr))
 
     def findAllChildren(self, tag, wanted = None):
         """All children that do have the given tag and attributes.
@@ -65,34 +76,60 @@ class Node (object):
 
         Optional second argument, wanted, should either be None or map
         attribute names to the values they must have. Only child nodes
-        with these attributes set to the given values are yielded."""
+        with these attributes set to the given values are yielded."""
 
-        cutoff = 4 # Only accept approved, for now
         for child in self.dom.childNodes:
             if child.nodeType != child.ELEMENT_NODE:
                 continue
             if child.nodeName != tag:
                 continue
 
-            try:
-                draft = child.attributes['draft']
-            except KeyError:
-                pass
-            else:
-                if self.__draftScores.get(draft, 0) < cutoff:
-                    continue
-
-            if wanted is not None:
+            if wanted:
                 try:
-                    if wanted and any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
+                    if any(child.attributes[k].nodeValue != v
+                           for k, v in wanted.items()):
                         continue
                 except KeyError: # Some wanted attribute is missing
                     continue
 
-            yield Node(child)
+            yield Node(child, self.draft)
+
+    def findUniqueChild(self, tag):
+        """Returns the single child with the given nodeName.
+
+        Raises Error if there is no such child or there is more than
+        one."""
+        seq = self.findAllChildren(tag)
+        try:
+            node = seq.next()
+        except StopIteration:
+            raise Error('No child found where one was expected', tag)
+        for it in seq:
+            raise Error('Many children found where only one was expected', tag)
+        return node
+
+    @classmethod
+    def draftScore(cls, level):
+        """Maps draft level names to numeric scores.
+
+        Single parameter, level, is the least sure value of the draft
+        attribute on a node that you're willing to accept; returns a
+        numeric value (lower is less drafty).
 
-    __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
-                         contributed = 3, approved = 4, false = 4)
+        Tempting as it is to insist on low draft scores, there are
+        many locales in which pretty much every leaf is
+        unconfirmed. It may make sense to actually check each
+        XmlScanner object, or each node in each LocaleScanner's nodes
+        list, to see what its distribution of draft level looks like,
+        so as to set the acceptable draft score for its elements
+        accordingly. However, for the moment, we mostly just accept
+        all elements, regardless of draft values (the one exception is
+        am/pm indicators)."""
+        return cls.__draftScores.get(level, 5) if level else 0
+
+    # Implementation details:
+    __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2,
+                         contributed = 1, approved = 0, false = 0)
 
 def _parseXPath(selector):
     # Split "tag[attr=val][...]" into tag-name and attribute mapping
@@ -129,7 +166,6 @@ class XmlScanner (object):
         return elts
 
 class Supplement (XmlScanner):
-    # Replaces xpathlite.findTagsInFile()
     def find(self, xpath):
         elts = self.findNodes(xpath)
         for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
@@ -138,3 +174,381 @@ class Supplement (XmlScanner):
                 yield (elt.nodeName,
                        dict((k, v if isinstance(v, basestring) else v.nodeValue)
                             for k, v in elt.attributes.items()))
+
+class LocaleScanner (object):
+    def __init__(self, name, nodes, root):
+        self.name, self.nodes, self.base = name, nodes, root
+
+    def find(self, xpath, draft = None): # returns matched element's first-child nodeValue, else raises Error
+        tags = xpath.split('/') # one selector per step of the path
+        while True: # goes round again only after an alias in root has rewritten tags
+            replace = None # becomes the alias's path (as a list) if root redirects us
+            for elt in self.nodes: # try each of our own nodes, in order
+                for selector in tags:
+                    tag, attrs = _parseXPath(selector)
+                    for elt in elt.findAllChildren(tag, attrs): # descend: elt is rebound to each candidate child
+                        if draft is None or elt.draft <= draft: # no limit given, or child is within it
+                            break # and process the next selector
+                    else:
+                        break # no child, try next elt in self.nodes
+                else:
+                    # processed all selectors
+                    try:
+                        return elt.dom.firstChild.nodeValue
+                    except (AttributeError, KeyError):
+                        pass # move on to next elt in self.nodes
+
+            # No match in self.nodes; check root
+            elt = self.base.root
+            for i, selector in enumerate(tags): # i is how many selectors we have already descended through
+                tag, attrs = _parseXPath(selector)
+                for alias in elt.findAllChildren('alias'):
+                    if alias.dom.attributes['source'].nodeValue == 'locale':
+                        replace = alias.dom.attributes['path'].nodeValue.split('/')
+                        tags = self.__xpathJoin(tags[:i], replace, tags[i:]) # splice the alias's path in at step i
+                        break
+                else: # no source="locale" alias at this level, so descend as for self.nodes
+                    for elt in elt.findAllChildren(tag, attrs):
+                        if draft is None or elt.draft <= draft:
+                            break # and process the next selector
+                    else:
+                        break # no acceptable child in root either
+                if replace:
+                    break # abandon this walk; the while loop retries with the rewritten tags
+            else:
+                # processed all selectors
+                try:
+                    return elt.dom.firstChild.nodeValue
+                except (AttributeError, KeyError):
+                    # No match
+                    pass
+            if not replace:
+                break # nothing found and no alias to follow: give up
+
+        sought = '/'.join(tags)
+        if sought != xpath: # an alias rewrote the path: report both forms
+            sought += ' (for {})'.format(xpath)
+        raise Error('No {} in {}'.format(sought, self.name))
+
+    def findOr(self, xpath, fallback = ''):
+        """Use a fall-back value if we don't find data.
+
+        Like find, but takes a fall-back value to return instead of
+        raising Error on failure."""
+        try:
+            return self.find(xpath)
+        except Error:
+            return fallback
+
+    def tagCodes(self):
+        """Yields four tag codes
+
+        The tag codes are language, script, country and variant; an
+        empty value for any of them indicates that no value was
+        provided.  The values are obtained from the primary file's
+        top-level <identity> element.  An Error is raised if any
+        top-level <alias> element of this file has a non-empty source
+        attribute; that attribute value is mentioned in the error's
+        message."""
+        root = self.nodes[0]
+        for alias in root.findAllChildren('alias'):
+            try:
+                source = alias.dom.attributes['source'].nodeValue
+            except (KeyError, AttributeError):
+                pass
+            else:
+                raise Error('Alias to {}'.format(source))
+
+        ids = root.findUniqueChild('identity')
+        for code in ('language', 'script', 'territory', 'variant'):
+            for node in ids.findAllChildren(code):
+                try:
+                    yield node.dom.attributes['type'].nodeValue
+                except (KeyError, AttributeError):
+                    pass
+                else:
+                    break # only want one value for each code
+            else: # No value for this code, use empty
+                yield ''
+
+    def currencyData(self, isoCode):
+        """Generate this locale's symbol and display names for a currency.
+
+        Single argument, isoCode, is the ISO currency code for the
+        currency in use in the country; if it is empty, both values
+        yielded are empty. See also numericData, which includes some
+        currency formats."""
+        symbol = name = ''
+        if isoCode:
+            stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
+            symbol = self.findOr(stem + 'symbol')
+            # The plain displayName first, then one for each count value:
+            tails = [''] + ['[count={}]'.format(x) for x in
+                            ('zero', 'one', 'two', 'few', 'many', 'other')]
+            name = ''.join(self.findOr(stem + 'displayName' + tail) + ';'
+                           for tail in tails)
+
+        yield 'currencySymbol', symbol
+        yield 'currencyDisplayName', name
+
+    def numericData(self, lookup, complain = lambda text: None):
+        """Generate (key, value) pairs of numeric data for the locale.
+
+        First argument, lookup, is a callable that maps a numbering
+        system's name to certain data about the system, as a mapping;
+        we expect this to have u'digits' as a key. The optional second
+        argument, complain, is accepted but not currently used."""
+        system = self.find('numbers/defaultNumberingSystem')
+        stem = 'numbers/symbols[numberSystem={}]/'.format(system)
+        decimal = self.find(stem + 'decimal')
+        group = self.find(stem + 'group')
+        assert decimal != group, (self.name, system, decimal)
+        yield 'decimal', decimal
+        yield 'group', group
+        yield 'percent', self.find(stem + 'percentSign')
+        yield 'list', self.find(stem + 'list')
+        # FIXME: don't lower-case:
+        yield 'exp', self.find(stem + 'exponential').lower()
+
+        digits = lookup(system)['digits']
+        assert len(digits) == 10
+        zero = digits[0]
+        # Qt's number-formatting code assumes digits are consecutive:
+        assert all(ord(c) == i for i, c in enumerate(digits, ord(zero)))
+        yield 'zero', zero
+
+        plus = self.find(stem + 'plusSign')
+        minus = self.find(stem + 'minusSign')
+        yield 'plus', plus
+        yield 'minus', minus
+
+        # Currency formatting (currencyFormat may have a type field):
+        money = self.find('numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern')
+        money = self.__currencyFormats(money, plus, minus)
+        yield 'currencyFormat', next(money) # builtin next(): fine on Python 2.6+ and 3
+        neg = ''
+        for it in money:
+            assert not neg, 'There should be at most one more pattern'
+            neg = it
+        yield 'currencyNegativeFormat', neg
+
+    def textPatternData(self):
+        for key in ('quotationStart', 'alternateQuotationEnd',
+                    'quotationEnd', 'alternateQuotationStart'):
+            yield key, self.find('delimiters/' + key)
+
+        for key in ('start', 'middle', 'end'):
+            yield ('listPatternPart' + key.capitalize(),
+                   self.__fromLdmlListPattern(self.find(
+                        'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+        yield ('listPatternPartTwo',
+               self.__fromLdmlListPattern(self.find(
+                    'listPatterns/listPattern/listPatternPart[2]')))
+
+        stem = 'dates/calendars/calendar[gregorian]/'
+        # TODO: is wide really the right width to use here ?
+        # abbreviated might be an option ... or try both ?
+        meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+        for key in ('am', 'pm'):
+            yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+                                 draft = Node.draftScore('contributed'))
+
+        for pair in (('long', 'full'), ('short', 'short')):
+            for key in ('time', 'date'):
+                yield (pair[0] + key.capitalize() + 'Format',
+                       convert_date(self.find(
+                            stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
+                                key, key, pair[1], key))))
+
+    def endonyms(self, language, script, country, variant):
+        # TODO: take variant into account ?
+        for seq in ((language, script, country),
+                    (language, script), (language, country), (language,)):
+            if not all(seq):
+                continue
+            try:
+                yield ('languageEndonym',
+                       self.find('localeDisplayNames/languages/language[{}]'
+                                 .format('_'.join(seq))))
+            except Error:
+                pass
+            else:
+                break
+        else:
+            # grumble(failed to find endonym for language)
+            yield 'languageEndonym', ''
+
+        yield ('countryEndonym',
+               self.findOr('localeDisplayNames/territories/territory[{}]'
+                           .format(country)))
+
+    def unitData(self):
+        yield ('byte_unit',
+               self.findOr('units/unitLength[long]/unit[digital-byte]/displayName',
+                           'bytes'))
+
+        unit = self.__findUnit('', 'B')
+        cache = [] # Populated by the SI call, to give hints to the IEC call
+        yield ('byte_si_quantified',
+               ';'.join(self.__unitCount('', unit, cache)))
+        # IEC 60027-2
+        # http://physics.nist.gov/cuu/Units/binary.html
+        yield ('byte_iec_quantified',
+               ';'.join(self.__unitCount('bi', 'iB', cache)))
+
+    def calendarNames(self, calendars):
+        namings = self.__nameForms
+        for cal in calendars:
+            stem = 'dates/calendars/calendar[' + cal + ']/months/'
+            for key, mode, size in namings:
+                prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
+                yield (key + 'Months_' + cal,
+                       ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+                                for i in range(1, 13)) + ';')
+
+        # Day data (for Gregorian, at least):
+        stem = 'dates/calendars/calendar[gregorian]/days/'
+        days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
+        for (key, mode, size) in namings:
+            prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
+            yield (key + 'Days',
+                   ';'.join(self.find(stem + prop + '[' + day + ']')
+                            for day in days) + ';')
+
+    # Implementation details
+    __nameForms = (
+        ('standaloneLong', 'stand-alone', 'wide'),
+        ('standaloneShort', 'stand-alone', 'abbreviated'),
+        ('standaloneNarrow', 'stand-alone', 'narrow'),
+        ('long', 'format', 'wide'),
+        ('short', 'format', 'abbreviated'),
+        ('narrow', 'format', 'narrow'),
+        ) # Used for month and day names
+
+    def __findUnit(self, keySuffix, quantify, fallback=''):
+        # The displayName for a quantified unit in en.xml is kByte
+        # (even for unitLength[narrow]) instead of kB (etc.), so
+        # prefer any unitPattern provided, but prune its placeholder:
+        for size in ('short', 'narrow'): # TODO: reverse order ?
+            stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+            for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+                try:
+                    ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+                except Error:
+                    continue
+
+                # TODO: do count-handling, instead of discarding placeholders
+                if False: # TODO: do it this way, instead !
+                    ans = ans.replace('{0}', '').strip()
+                elif ans.startswith('{0}'):
+                    ans = ans[3:].lstrip()
+                if ans:
+                    return ans
+
+            try:
+                return self.find(stem + 'displayName')
+            except Error:
+                pass
+
+        return fallback
+
+    def __unitCount(self, keySuffix, suffix, cache,
+                    # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+                    # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+                    siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+        """Work out the unit quantifiers.
+
+        Unfortunately, the CLDR data only go up to terabytes and we
+        want all the way to exabytes; but we can recognize the SI
+        quantifiers as prefixes, strip and identify the tail as the
+        localized translation for 'B' (e.g. French has 'octet' for
+        'byte' and uses ko, Mo, Go, To from which we can extrapolate
+        Po, Eo).
+
+        Should be called first for the SI quantifiers, with suffix =
+        'B', then for the IEC ones, with suffix = 'iB'; the list cache
+        (initially empty before first call) is used to let the second
+        call know what the first learned about the localized unit.
+        """
+        if suffix == 'iB': # second call, re-using first's cache
+            if cache:
+                byte = cache.pop()
+                if all(byte == k for k in cache):
+                    suffix = 'i' + byte
+            for q in siQuantifiers:
+                # Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
+                yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+        else: # first call
+            tail = suffix = suffix or 'B'
+            for q in siQuantifiers:
+                it = self.__findUnit(keySuffix, q)
+                # kB for kilobyte, in contrast with KiB for IEC:
+                q = q[0] if q == 'kilo' else q[0].upper()
+                if not it:
+                    it = q + tail
+                elif it.startswith(q):
+                    rest = it[1:]
+                    tail = rest if all(rest == k for k in cache) else suffix
+                    cache.append(rest)
+                yield it
+
+    @staticmethod
+    def __currencyFormats(patterns, plus, minus):
+        """Yield each ';'-separated LDML pattern in Qt's form: the number
+        becomes %1, the currency sign %2 and signs use plus and minus."""
+        for p in patterns.split(';'):
+            p = p.replace('0', '#').replace(',', '').replace('.', '')
+            # Keep only the first '#'; if there is none, find() gives -1,
+            # so cut is 0 and there is nothing to remove (it never raises):
+            cut = p.find('#') + 1
+            p = p[:cut] + p[cut:].replace('#', '')
+            p = p.replace('#', "%1")
+            # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
+            # there can be doubled or tripled currency sign, however none of the
+            # locales use that.
+            p = p.replace(u'\xa4', "%2")
+            # Single quote goes away, but double goes to single:
+            p = p.replace("''", '###').replace("'", '').replace('###', "'")
+            # Use number system's signs:
+            p = p.replace('+', plus).replace('-', minus)
+            yield p
+
+    @staticmethod
+    def __fromLdmlListPattern(pattern):
+        # This is a very limited parsing of the format for list pattern part only.
+        return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3')
+
+    @staticmethod
+    def __fromLdmlPath(seq): # tool function for __xpathJoin()
+        """Convert LDML's [@name='value'] to our [name=value] form.
+
+        Generator; yields each selector of seq, converted. Selectors
+        with no attribute part are passed through unchanged."""
+        for it in seq:
+            parts = it.split('[') # the tag, then one entry per attribute
+            if len(parts) < 2: # Short-cut the easy case:
+                yield it
+                continue
+
+            assert all(x.endswith(']') for x in parts[1:])
+            fixed = [parts[0]]
+            for spec in parts[1:]:
+                # Split at the first '=' only; there may be none, or several:
+                name, eq, value = spec[:-1].partition('=')
+                if value.startswith("'") and value.endswith("'"):
+                    value = value[1:-1]
+                fixed.append((name[1:] if name.startswith('@') else name) + eq + value + ']')
+            yield '['.join(fixed)
+
+    @classmethod
+    def __xpathJoin(cls, head, insert, tail):
+        """Join three lists of XPath selectors.
+
+        Each of head, insert and tail is a list of selectors; insert
+        may start with some uses of '..', that we resolve away, and
+        may use LDML's attribute format, that we convert to ours."""
+        head, insert = list(head), list(insert) # leave caller's lists intact
+        while insert and insert[0] == '..':
+            insert.pop(0)
+            head.pop()
+        return head + list(cls.__fromLdmlPath(insert)) + tail
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index 59161ed9d0..1938be19ea 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -480,7 +480,7 @@ def main(args, out, err):
         return 1
 
     reader = QLocaleXmlReader(qlocalexml)
-    locale_map = dict(reader.loadLocaleMap(calendars, sys.stderr.write))
+    locale_map = dict(reader.loadLocaleMap(calendars, err.write))
 
     locale_keys = locale_map.keys()
     compareLocaleKeys.default_map = dict(reader.defaultMap())
diff --git a/util/locale_database/xpathlite.py b/util/locale_database/xpathlite.py
deleted file mode 100644
index 3da8b24656..0000000000
--- a/util/locale_database/xpathlite.py
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/env python
-#############################################################################
-##
-## Copyright (C) 2016 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
-
-import sys
-import os
-import xml.dom.minidom
-
-from localetools import Error
-
-class DraftResolution:
-    # See http://www.unicode.org/cldr/process.html for description
-    unconfirmed = 'unconfirmed'
-    provisional = 'provisional'
-    contributed = 'contributed'
-    approved = 'approved'
-    _values = { unconfirmed : 1, provisional : 2, contributed : 3, approved : 4 }
-    def __init__(self, resolution):
-        self.resolution = resolution
-    def toInt(self):
-        return DraftResolution._values[self.resolution]
-
-doc_cache = {}
-def parseDoc(file):
-    if not doc_cache.has_key(file):
-        doc_cache[file] = xml.dom.minidom.parse(file)
-    return doc_cache[file]
-
-def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
-    for node in parent.childNodes:
-        if node.nodeType != node.ELEMENT_NODE:
-            continue
-        if node.nodeName != tag_name:
-            continue
-        if arg_value:
-            if not node.attributes.has_key(arg_name):
-                continue
-            if node.attributes[arg_name].nodeValue != arg_value:
-                continue
-        if draft:
-            if not node.attributes.has_key('draft'):
-                # if draft is not specified then it's approved
-                return node
-            value = node.attributes['draft'].nodeValue
-            value = DraftResolution(value).toInt()
-            exemplar = DraftResolution(draft).toInt()
-            if exemplar > value:
-                continue
-        return node
-    return False
-
-def codeMapsFromFile(file):
-    """Extract mappings of language, script and country codes to names.
-
-    The file shall typically be common/main/en.xml, which contains a
-    localeDisplayNames element with children languages, scripts and
-    territories; each element in each of these has a code as its type
-    attribute and its name as element content.  This returns a mapping
-    withe keys 'language', 'script' and 'country', each of which
-    has, as value, a mapping of the relevant codes to names.
-    """
-    parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
-    keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
-    for src, dst in keys.items():
-        child = findChild(parent, src)
-        data = result[dst] = {}
-        for elt in child.childNodes:
-            if elt.attributes and elt.attributes.has_key('type'):
-                key, value = elt.attributes['type'].value, elt.childNodes[0].wholeText
-                # Don't over-write previously-read data for an alt form:
-                if elt.attributes.has_key('alt') and data.has_key(key):
-                    continue
-                data[key] = value
-
-    return result
-
-def findTagsInFile(file, path):
-    doc = parseDoc(file)
-
-    elt = doc.documentElement
-    tag_spec_list = path.split("/")
-    last_entry = None
-    for tag_spec in tag_spec_list:
-        tag_name = tag_spec
-        arg_name = 'type'
-        arg_value = ''
-        left_bracket = tag_spec.find('[')
-        if left_bracket != -1:
-            tag_name = tag_spec[:left_bracket]
-            arg_value = tag_spec[left_bracket+1:-1].split("=")
-            if len(arg_value) == 2:
-                arg_name = arg_value[0]
-                arg_value = arg_value[1]
-            else:
-                arg_value = arg_value[0]
-        elt = findChild(elt, tag_name, arg_name, arg_value)
-        if not elt:
-            return None
-    ret = []
-    if elt.childNodes:
-        for node in elt.childNodes:
-            if node.attributes:
-                element = [node.nodeName, None]
-                element[1] = node.attributes.items()
-                ret.append(element)
-    else:
-        if elt.attributes:
-            element = [elt.nodeName, None]
-            element[1] = elt.attributes.items()
-            ret.append(element)
-    return ret
-
-def _findEntryInFile(file, path, draft=None, attribute=None):
-    doc = parseDoc(file)
-
-    elt = doc.documentElement
-    tag_spec_list = path.split("/")
-    last_entry = None
-    for i in range(len(tag_spec_list)):
-        tag_spec = tag_spec_list[i]
-        tag_name = tag_spec
-        arg_name = 'type'
-        arg_value = ''
-        left_bracket = tag_spec.find('[')
-        if left_bracket != -1:
-            tag_name = tag_spec[:left_bracket]
-            arg_value = tag_spec[left_bracket+1:-1].split("=")
-            if len(arg_value) == 2:
-                arg_name = arg_value[0].replace("@", "").replace("'", "")
-                arg_value = arg_value[1]
-            else:
-                arg_value = arg_value[0]
-        alias = findChild(elt, 'alias')
-        if alias and alias.attributes['source'].nodeValue == 'locale':
-            path = alias.attributes['path'].nodeValue
-            aliaspath = tag_spec_list[:i] + path.split("/")
-            def resolve(x, y):
-                if y == '..':
-                    return x[:-1]
-                return x + [y]
-            # resolve all dot-dot parts of the path
-            aliaspath = reduce(resolve, aliaspath, [])
-            # remove attribute specification that our xpathlite doesnt support
-            aliaspath = map(lambda x: x.replace("@type=", "").replace("'", ""), aliaspath)
-            # append the remaining path
-            aliaspath = aliaspath + tag_spec_list[i:]
-            aliaspath = "/".join(aliaspath)
-            # "locale" aliases are special - we need to start lookup from scratch
-            return (None, aliaspath)
-        elt = findChild(elt, tag_name, arg_name, arg_value, draft)
-        if not elt:
-            return ("", None)
-    if attribute is not None:
-        if elt.attributes.has_key(attribute):
-            return (elt.attributes[attribute].nodeValue, None)
-        return (None, None)
-    try:
-        return (elt.firstChild.nodeValue, None)
-    except:
-        pass
-    return (None, None)
-
-def findAlias(file):
-    doc = parseDoc(file)
-
-    alias_elt = findChild(doc.documentElement, "alias")
-    if not alias_elt:
-        return False
-    if not alias_elt.attributes.has_key('source'):
-        return False
-    return alias_elt.attributes['source'].nodeValue
-
-lookup_chain_cache = {}
-parent_locales = {}
-def _fixedLookupChain(dirname, name):
-    if lookup_chain_cache.has_key(name):
-        return lookup_chain_cache[name]
-
-    # see http://www.unicode.org/reports/tr35/#Parent_Locales
-    if not parent_locales:
-        for ns in findTagsInFile(dirname + "/../supplemental/supplementalData.xml", "parentLocales"):
-            tmp = {}
-            parent_locale = ""
-            for data in ns[1:][0]: # ns looks like this: [u'parentLocale', [(u'parent', u'root'), (u'locales', u'az_Cyrl bs_Cyrl en_Dsrt ..')]]
-                tmp[data[0]] = data[1]
-                if data[0] == u"parent":
-                    parent_locale = data[1]
-            parent_locales[parent_locale] = tmp[u"locales"].split(" ")
-
-    items = name.split("_")
-    # split locale name into items and iterate through them from back to front
-    # example: az_Latn_AZ => [az_Latn_AZ, az_Latn, az]
-    items = list(reversed(map(lambda x: "_".join(items[:x+1]), range(len(items)))))
-
-    for i in range(len(items)):
-        item = items[i]
-        for parent_locale in parent_locales.keys():
-            for locale in parent_locales[parent_locale]:
-                if item == locale:
-                    if parent_locale == u"root":
-                        items = items[:i+1]
-                    else:
-                        items = items[:i+1] + _fixedLookupChain(dirname, parent_locale)
-                    lookup_chain_cache[name] = items
-                    return items
-
-    lookup_chain_cache[name] = items
-    return items
-
-def _findEntry(base, path, draft=None, attribute=None):
-    if base.endswith(".xml"):
-        base = base[:-4]
-    (dirname, filename) = os.path.split(base)
-
-    items = _fixedLookupChain(dirname, filename)
-    for item in items:
-        file = dirname + "/" + item + ".xml"
-        if os.path.isfile(file):
-            alias = findAlias(file)
-            if alias:
-                # if alias is found we should follow it and stop processing current file
-                # see http://www.unicode.org/reports/tr35/#Common_Elements
-                aliasfile = os.path.dirname(file) + "/" + alias + ".xml"
-                if not os.path.isfile(aliasfile):
-                    raise Error("findEntry: fatal error: found an alias '%s' to '%s', but the alias file couldn't be found" % (filename, alias))
-                # found an alias, recurse into parsing it
-                result = _findEntry(aliasfile, path, draft, attribute)
-                return result
-            (result, aliaspath) = _findEntryInFile(file, path, draft, attribute)
-            if aliaspath:
-                # start lookup again because of the alias source="locale"
-                return _findEntry(base, aliaspath, draft, attribute)
-            if result:
-                return result
-    return None
-
-def findEntry(base, path, draft=None, attribute=None):
-    file = base
-    if base.endswith(".xml"):
-        file = base
-        base = base[:-4]
-    else:
-        file = base + ".xml"
-    (dirname, filename) = os.path.split(base)
-
-    result = None
-    while path:
-        result = _findEntry(base, path, draft, attribute)
-        if result:
-            return result
-        (result, aliaspath) = _findEntryInFile(dirname + "/root.xml", path, draft, attribute)
-        if result:
-            return result
-        if not aliaspath:
-            raise Error("findEntry: fatal error: %s: cannot find key %s" % (filename, path))
-        path = aliaspath
-
-    return result
-