1 files changed, 494 insertions, 4 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
index 7890adf307..94459b9e3f 100644
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@@ -27,7 +27,8 @@
 #############################################################################
 """Digesting the CLDR's data.
 
-Provides two class:
+Provides two classes:
+  CldrReader -- driver for reading CLDR data
   CldrAccess -- used by the reader to access the tree of data files
 
 The former should normally be all you need to access.
@@ -38,9 +39,206 @@ from xml.dom import minidom
 from weakref import WeakValueDictionary as CacheDict
 import os
 
-from localetools import Error
-from ldml import Node, Supplement
+from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from qlocalexml import Locale
 
+class CldrReader (object):
+    def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None):
+        """Set up a reader object for reading CLDR data.
+
+        Required first parameter, root, is the file-system path to the
+        root of the unpacked CLDR archive; its common/ sub-directory
+        should contain dtd/, main/ and supplemental/ sub-directories.
+
+        Optional second argument, grumble, is a callable that logs
+        warnings and complaints, e.g. sys.stderr.write would be a
+        suitable callable.  The default is a no-op that ignores its
+        single argument.  Optional third argument, whitter, is similar
+        but used for less interesting output; pass sys.stderr.write
+        for it to get verbose output."""
+        self.root = CldrAccess(root) # A CldrAccess wrapping the path, not the bare path.
+        self.whitter, self.grumble = whitter, grumble
+
+    def likelySubTags(self):
+        """Generator for likely subtag information.
+
+        Yields pairs (have, give) of 4-tuples; if what you have
+        matches the left member, giving the right member is probably
+        sensible. Each 4-tuple's entries are the full names of a
+        language, a script, a country (strictly territory) and a
+        variant (currently ignored)."""
+        skips = []
+        for got, use in self.root.likelySubTags():
+            try:
+                have = self.__parseTags(got)
+                give = self.__parseTags(use)
+            except Error as e:
+                if ((use.startswith(got) or got.startswith('und_'))
+                    and e.message.startswith('Unknown ') and ' code ' in e.message):
+                    skips.append(use)
+                else:
+                    self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message))
+                continue
+            if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]):
+                continue
+
+            give = (give[0],
+                    # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+                    have[1] if give[1] == 'AnyScript' else give[1],
+                    have[2] if give[2] == 'AnyCountry' else give[2],
+                    give[3]) # AnyVariant similarly ?
+
+            yield have, give
+
+        if skips:
+            # TODO: look at LDML's reserved locale tag names; they
+            # show up a lot in this, and may be grounds for filtering
+            # more out.
+            pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
+
+    def readLocales(self, calendars = ('gregorian',)): # calendars: passed on to calendarNames()
+        locales = tuple(self.__allLocales(calendars)) # Run the generator to completion first.
+        return dict(((k.language_id, k.script_id, k.country_id, k.variant_code), # Key: IDs and variant code;
+                     k) for k in locales) # value: the Locale; a later duplicate key replaces an earlier one.
+
+    def __allLocales(self, calendars):
+        def skip(locale, reason):
+            return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason)
+
+        for locale in self.root.defaultContentLocales:
+            try:
+                language, script, country, variant = self.__splitLocale(locale)
+            except ValueError:
+                self.whitter(skip(locale, 'only language tag'))
+                continue
+
+            if not (script or country):
+                self.grumble(skip(locale, 'second tag is neither script nor territory'))
+                continue
+
+            if not (language and country):
+                continue
+
+            try:
+                yield self.__getLocaleData(self.root.locale(locale), calendars,
+                                           language, script, country, variant)
+            except Error as e:
+                self.grumble(skip(locale, e.message))
+
+        for locale in self.root.fileLocales:
+            try:
+                chain = self.root.locale(locale)
+                language, script, country, variant = chain.tagCodes()
+                assert language
+                # TODO: this skip should probably be based on likely
+                # sub-tags, instead of empty country: if locale has a
+                # likely-subtag expansion, that's what QLocale uses,
+                # and we'll be saving its data for the expanded locale
+                # anyway, so don't need to record it for itself.
+                # See also QLocaleXmlReader.loadLocaleMap's grumble.
+                if not country:
+                    continue
+                yield self.__getLocaleData(chain, calendars, language, script, country, variant)
+            except Error as e:
+                self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message))
+
+    import textwrap
+    @staticmethod
+    def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap):
+        writer('\n'.join(wrap(prefix + ', '.join(tokens),
+                              subsequent_indent=' ', width=80)) + '\n')
+    del textwrap
+
+    def __parseTags(self, locale): # Map a locale name to a 4-tuple of full names.
+        tags = self.__splitLocale(locale) # A generator, yielding one or four codes.
+        language = tags.next()
+        script = country = variant = ''
+        try:
+            script, country, variant = tags
+        except ValueError: # Only a language tag was present; keep the empty defaults.
+            pass
+        return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant)) # names of (ID, name) pairs
+
+    def __splitLocale(self, name):
+        """Generate (language, script, territory, variant) from a locale name
+
+        Ignores any trailing fields (with a warning via self.grumble);
+        leaves script (a capitalised four-letter token), territory (a
+        number or an all-uppercase token) or variant (upper case and
+        digits) empty if unspecified.  Only generates one entry if
+        name is a single tag (i.e. contains no underscores).  Always
+        yields 1 or 4 values, never 2 or 3."""
+        tags = iter(name.split('_'))
+        yield tags.next() # Language
+        tag = tags.next() # may raise StopIteration, ending the generator after one value
+
+        # Script is always four letters, always capitalised:
+        if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+            yield tag
+            try:
+                tag = tags.next()
+            except StopIteration:
+                tag = ''
+        else:
+            yield ''
+
+        # Territory is upper-case or numeric (an empty tag is neither):
+        if tag and (tag.isupper() or tag.isdigit()):
+            yield tag
+            try:
+                tag = tags.next()
+            except StopIteration:
+                tag = ''
+        else:
+            yield ''
+
+        # Variant can be any mixture of upper-case and digits.
+        if tag and all(c.isupper() or c.isdigit() for c in tag):
+            yield tag
+            tag = ''
+        else:
+            yield ''
+
+        # If nothing is left, StopIteration avoids the warning; tag is one string, hence (tag,):
+        if not tag:
+            tag = tags.next()
+        self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join((tag,) + tuple(tags)), name))
+
+    def __getLocaleData(self, scan, calendars, language, script, country, variant):
+        ids, names = zip(*self.root.codesToIdName(language, script, country, variant))
+        assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant)
+        locale = Locale(
+            language = names[0], language_code = language, language_id = ids[0],
+            script = names[1], script_code = script, script_id = ids[1],
+            country = names[2], country_code = country, country_id = ids[2],
+            variant_code = variant)
+
+        firstDay, weStart, weEnd = self.root.weekData(country)
+        assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+                   for day in (firstDay, weStart, weEnd))
+
+        locale.update(firstDayOfWeek = firstDay,
+                      weekendStart = weStart,
+                      weekendEnd = weEnd)
+
+        iso, digits, rounding = self.root.currencyData(country)
+        locale.update(currencyIsoCode = iso,
+                      currencyDigits = int(digits),
+                      currencyRounding = int(rounding))
+
+        locale.update(scan.currencyData(iso))
+        locale.update(scan.numericData(self.root.numberSystem, self.whitter))
+        locale.update(scan.textPatternData())
+        locale.update(scan.endonyms(language, script, country, variant))
+        locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ...
+        locale.update(scan.calendarNames(calendars)) # Names of days and months
+
+        return locale
+
+# Note: various caches assume this class is a singleton, so the
+# "default" value for a parameter no caller should pass can serve as
+# the cache. If a process were to instantiate this class with distinct
+# roots, each cache would be filled by the first to need it !
 class CldrAccess (object):
     def __init__(self, root):
         """Set up a master object for accessing CLDR data.
@@ -50,6 +248,12 @@ class CldrAccess (object):
         contain dtd/, main/ and supplemental/ sub-directories."""
         self.root = root
 
+    def xml(self, *path):
+        """Load a single XML file and return its root element as an XmlScanner.
+
+        The path is interpreted relative to self.root"""
+        return XmlScanner(Node(self.__xml(path)))
+
     def supplement(self, name):
         """Loads supplemental data as a Supplement object.
 
@@ -57,6 +261,117 @@ class CldrAccess (object):
         """
         return Supplement(Node(self.__xml(('common', 'supplemental', name))))
 
+    def locale(self, name):
+        """Loads all data for a locale as a LocaleScanner object.
+
+        The name should be a locale name; adding suffix '.xml' to it
+        should usually yield a file in common/main/.  The returned
+        LocaleScanner object packages this file along with all those
+        from which it inherits; its methods know how to handle that
+        inheritance, where relevant."""
+        return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
+
+    @property
+    def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir,
+                    splitExtension = os.path.splitext):
+        """Generator for locale IDs seen in file-names.
+
+        All *.xml other than root.xml in common/main/ are assumed to
+        identify locales."""
+        for name in listDirectory(joinPath(self.root, 'common', 'main')):
+            stem, ext = splitExtension(name)
+            if ext == '.xml' and stem != 'root':
+                yield stem
+
+    @property
+    def defaultContentLocales(self):
+        """Generator for the default content locales."""
+        for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'):
+            try:
+                locales = attrs['locales']
+            except KeyError:
+                pass
+            else:
+                for locale in locales.split():
+                    yield locale
+
+    def likelySubTags(self): # Generator of (from, to) attribute pairs read from likelySubtags.xml.
+        for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'):
+            yield attrs['from'], attrs['to'] # Element name is ignored; both attributes are required.
+
+    def numberSystem(self, system):
+        """Get a description of a numbering system.
+
+        Returns a mapping, with keys u'digits', u'type' and u'id'; the
+        value for this last is system. Raises ldml.Error (not KeyError)
+        for an unknown, or otherwise unsupported, number system."""
+        try:
+            return self.__numberSystems[system]
+        except KeyError: # Translate to the Error type that callers catch.
+            raise Error('Unsupported number system: {}'.format(system))
+
+    def weekData(self, country):
+        """Data on the weekly cycle.
+
+        Returns a triple (W, S, E) of en's short names for week-days;
+        W is the first day of the week, S the start of the week-end
+        and E the end of the week-end.  Where data for a country is
+        unavailable, the data for CLDR's territory 001 (The World) is
+        used."""
+        try:
+            return self.__weekData[country]
+        except KeyError:
+            return self.__weekData['001']
+
+    def currencyData(self, country):
+        """Returns currency data for the given country code.
+
+        Return value is a tuple (ISO4217 code, digit count, rounding
+        mode).  If CLDR provides no data for this country, ('', 2, 1)
+        is the default result.
+        """
+        try:
+            return self.__currencyData[country]
+        except KeyError:
+            return '', 2, 1
+
+    def codesToIdName(self, language, script, country, variant = ''):
+        """Maps each code to the appropriate ID and name.
+
+        Returns a 4-tuple of (ID, name) pairs corresponding to the
+        language, script, country and variant given.  Raises a
+        suitable error if any of them is unknown, indicating all that
+        are unknown plus suitable names for any that could sensibly be
+        added to enumdata.py to make them known.
+
+        Until we implement variant support (QTBUG-81051), the fourth
+        member of the returned tuple is always 0 paired with a string
+        that should not be used."""
+        enum = self.__enumMap
+        try:
+            return (enum('language')[language],
+                    enum('script')[script],
+                    enum('country')[country],
+                    enum('variant')[variant])
+        except KeyError:
+            pass
+
+        parts, values = [], [language, script, country, variant]
+        for index, key in enumerate(('language', 'script', 'country', 'variant')):
+            naming, enums = self.__codeMap(key), enum(key)
+            value = values[index]
+            if value not in enums:
+                text = '{} code {}'.format(key, value)
+                name = naming.get(value)
+                if name and value != 'POSIX':
+                    text += u' (could add {})'.format(name)
+                parts.append(text)
+        if len(parts) > 1:
+            parts[-1] = 'and ' + parts[-1]
+        assert parts
+        raise Error('Unknown ' + ', '.join(parts),
+                    language, script, country, variant)
+
     def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
         """Digest CLDR's MS-Win time-zone name mapping.
 
@@ -139,11 +454,97 @@ class CldrAccess (object):
         return open(joinPath(self.root, *path))
 
     @property
+    def __rootLocale(self, cache = []):
+        if not cache:
+            cache.append(self.xml('common', 'main', 'root.xml'))
+        return cache[0]
+
+    @property
     def __supplementalData(self, cache = []):
         if not cache:
             cache.append(self.supplement('supplementalData.xml'))
         return cache[0]
 
+    @property
+    def __numberSystems(self, cache = {}, joinPath=os.path.join): # cache: id -> attributes, filled on first use
+        if not cache:
+            for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
+                if ord(attrs.get('digits', u'\U00010000')[0]) > 0xffff:
+                    # FIXME, QTBUG-69324: make this redundant: omit number system if it
+                    # has no digits or its zero doesn't fit in single-char16 UTF-16 :-(
+                    continue # NOTE(review): only triggers where strings index by code point - confirm on narrow builds
+
+                cache[attrs['id']] = attrs
+            assert cache
+        return cache
+
+    @property
+    def __weekData(self, cache = {}):
+        if not cache:
+            firstDay, weStart, weEnd = self.__getWeekData()
+            # Massage those into an easily-consulted form:
+            # World defaults given for code '001':
+            mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001']
+            lands = set(firstDay) | set(weStart) | set(weEnd)
+            cache.update((land,
+                          (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun)))
+                         for land in lands)
+            assert cache
+        return cache
+
+    def __getWeekData(self):
+        """Scan for data on the weekly cycle.
+
+        Yields three mappings from locales to en's short names for
+        week-days; if a locale isn't a key of a given mapping, it
+        should use the '001' (world) locale's value. The first mapping
+        gives the day on which the week starts, the second gives the
+        day on which the week-end starts, the third gives the last day
+        of the week-end."""
+        source = self.__supplementalData
+        for key in ('firstDay', 'weekendStart', 'weekendEnd'):
+            result = {}
+            for ignore, attrs in source.find('weekData/' + key):
+                assert ignore == key
+                day = attrs['day']
+                assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
+                if 'alt' in attrs:
+                    continue
+                for loc in attrs.get('territories', '').split():
+                    result[loc] = day
+            yield result
+
+    @property
+    def __currencyData(self, cache = {}):
+        if not cache:
+            source = self.__supplementalData
+            for elt in source.findNodes('currencyData/region'):
+                iso, digits, rounding = '', 2, 1
+                try:
+                    country = elt.dom.attributes['iso3166'].nodeValue
+                except KeyError:
+                    continue
+                for child in elt.findAllChildren('currency'):
+                    try:
+                        if child.dom.attributes['tender'].nodeValue == 'false':
+                            continue
+                    except KeyError:
+                        pass
+                    try:
+                        child.dom.attributes['to'] # Is set if this element has gone out of date.
+                    except KeyError:
+                        iso = child.dom.attributes['iso4217'].nodeValue
+                        break
+                if iso:
+                    for tag, data in source.find(
+                        'currencyData/fractions/info[iso4217={}]'.format(iso)):
+                        digits = data['digits']
+                        rounding = data['rounding']
+                cache[country] = iso, digits, rounding
+            assert cache
+
+        return cache
+
     def __scanLdmlDtd(self, joinPath = os.path.join):
         """Scan the LDML DTD, record CLDR version."""
         with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
@@ -151,7 +552,8 @@ class CldrAccess (object):
                 if line.startswith('<!ATTLIST '):
                     parts = line.split()
                     if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
-                        # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+                        # parts[5] is the version, in quotes, maybe
+                        # with a final > attached to its end:
                         self.__cldrVersion = parts[5].split('"')[1]
                         break
 
@@ -178,5 +580,93 @@ class CldrAccess (object):
 
         return cache[key]
 
+    def __codeMap(self, key, cache = {},
+                  # Maps our name for it to CLDR's name:
+                  naming = {'language': 'languages', 'script': 'scripts',
+                            'country': 'territories', 'variant': 'variants'}):
+        if not cache:
+            root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames')
+            for dst, src in naming.items():
+                cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
+            assert cache
+
+        return cache[key]
+
+    def __codeMapScan(self, node):
+        """Get mapping from codes to element values.
+
+        Passed in node is a <languages>, <scripts>, <territories> or
+        <variants> node, each child of which is a <language>,
+        <script>, <territory> or <variant> node as appropriate, whose
+        type is a code (of the appropriate flavour) and content is its
+        full name.  In some cases, two child nodes have the same type;
+        in these cases, one always has an alt attribute and we should
+        prefer the other.  Yields all such type, content pairs found
+        in node's children (skipping any with an alt attribute, if
+        their type has been seen previously)."""
+        seen = set()
+        for elt in node.dom.childNodes:
+            try:
+                key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText
+            except (KeyError, ValueError, TypeError):
+                pass
+            else:
+                if key not in seen or not elt.attributes.has_key('alt'):
+                    yield key, value
+                    seen.add(key)
+
+    # CLDR uses inheritance between locales to save repetition:
+    def __parentLocale(self, name, cache = {}): # cache: child -> parent, shared by all calls, filled on first use
+        # see http://www.unicode.org/reports/tr35/#Parent_Locales
+        if not cache:
+            for tag, attrs in self.__supplementalData.find('parentLocales'):
+                parent = attrs.get('parent', '') # '' if the entry names no parent
+                for child in attrs['locales'].split(): # locales is a white-space separated list
+                    cache[child] = parent
+            assert cache
+
+        return cache[name] # KeyError if name has no explicit parent; __scanLocaleRoots relies on that.
+
+    def __localeAsDoc(self, name, aliasFor = None, # aliasFor: only set by the recursive call below
+                      joinPath = os.path.join, exists = os.path.isfile):
+        path = ('common', 'main', name + '.xml')
+        if exists(joinPath(self.root, *path)):
+            elt = self.__xml(path)
+            for child in Node(elt).findAllChildren('alias'):
+                try:
+                    alias = child.dom.attributes['source'].nodeValue
+                except (KeyError, AttributeError):
+                    pass
+                else:
+                    return self.__localeAsDoc(alias, aliasFor or name) # Follow the alias, remembering the first name.
+            # No alias child with a source:
+            return elt
+
+        if aliasFor: # No such file: fatal for an alias target; otherwise fall off the end, returning None.
+            raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias'
+                        .format(aliasFor, name))
+
+    def __scanLocaleRoots(self, name): # Generator of Node objects, from name up its inheritance chain, excluding root.
+        while name and name != 'root':
+            doc = self.__localeAsDoc(name)
+            if doc is not None: # None means there is no file for this name.
+                yield Node(doc)
+
+            try:
+                name = self.__parentLocale(name)
+            except KeyError: # No explicit parent: fall back to dropping the last tag.
+                try:
+                    name, tail = name.rsplit('_', 1)
+                except ValueError: # No tail to discard: we're done
+                    break
+
+    class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
+    def __localeRoots(self, name, cache = CacheDict()): # cache: weak-valued, shared by all calls
+        try:
+            chain = cache[name]
+        except KeyError: # Not cached, or its chain is no longer referenced elsewhere.
+            cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name))
+        return chain # A list sub-class holding the Node for each file in name's chain.
+
 # Unpolute the namespace: we don't need to export these.
 del minidom, CacheDict, os