diff options
Diffstat (limited to 'util/locale_database/cldr2qlocalexml.py')
-rwxr-xr-x | util/locale_database/cldr2qlocalexml.py | 697 |
1 file changed, 697 insertions, 0 deletions
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py new file mode 100755 index 0000000000..072ea9e4ed --- /dev/null +++ b/util/locale_database/cldr2qlocalexml.py @@ -0,0 +1,697 @@ +#!/usr/bin/env python2 +# coding=utf8 +############################################################################# +## +## Copyright (C) 2018 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Convert CLDR data to qLocaleXML + +The CLDR data can be downloaded from CLDR_, which has a sub-directory +for each version; you need the ``core.zip`` file for your version of +choice (typically the latest). This script has had updates to cope up +to v35; for later versions, we may need adaptations. 
Unpack the +downloaded ``core.zip`` and check it has a common/main/ sub-directory: +pass the path of that sub-directory to this script as its single +command-line argument. Save its standard output (but not error) to a +file for later processing by ``./qlocalexml2cpp.py`` + +When you update the CLDR data, be sure to also update +src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check +this script's output for unknown language, country or script messages; +if any can be resolved, use their entry in common/main/en.xml to +append new entries to enumdata.py's lists and update documentation in +src/corelib/text/qlocale.qdoc, adding the new entries in alphabetic +order. + +While updating the locale data, check also for updates to MS-Win's +time zone names; see cldr2qtimezone.py for details. + +.. _CLDR: ftp://unicode.org/Public/cldr/ +""" + +import os +import sys +import re +import textwrap + +import enumdata +import xpathlite +from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile +from dateconverter import convert_date +from localexml import Locale + +# TODO: make calendars a command-line option +calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew' +findEntryInFile = xpathlite._findEntryInFile +def wrappedwarn(prefix, tokens): + return sys.stderr.write( + '\n'.join(textwrap.wrap(prefix + ', '.join(tokens), + subsequent_indent=' ', width=80)) + '\n') + +def parse_number_format(patterns, data): + # this is a very limited parsing of the number format for currency only. 
def parse_number_format(patterns, data):
    """Convert semicolon-separated CLDR number patterns to Qt's %1/%2 form.

    Only the very limited subset of pattern syntax used by currency
    formats is understood.  Digit placeholders collapse to %1, the
    currency sign (U+00A4) becomes %2, and the locale's own minus and
    plus signs (taken from data) replace the ASCII ones."""
    def collapse_digits(text):
        # Normalise digit placeholders to '#', drop grouping and decimal
        # separators, then squeeze each run of '#'s down to a single one.
        text = text.replace('0', '#').replace(',', '').replace('.', '')
        squeezed = ''
        previous = ''
        for ch in text:
            if ch == '#' and previous == '#':
                continue
            squeezed += ch
            previous = ch
        return squeezed

    converted = []
    for pattern in patterns.split(';'):
        pattern = collapse_digits(pattern).replace('#', "%1")
        # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
        # the currency sign may be doubled or tripled; no locale does that.
        pattern = pattern.replace(u'\xa4', "%2")
        # Turn the '' escape into a literal quote, strip quoting quotes:
        pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'")
        pattern = pattern.replace('-', data['minus'])
        pattern = pattern.replace('+', data['plus'])
        converted.append(pattern)
    return converted

def raiseUnknownCode(code, form, cache={}):
    """Check whether an unknown code could be supported.

    We declare a language, script or country code unknown if it's not
    known to enumdata.py; however, if it's present in main/en.xml's
    mapping of codes to names, we have the option of adding support.
    The mutable default cache is deliberate: it makes us read
    main/en.xml only once, on the first call.

    First parameter, code, is the unknown code.  Second parameter,
    form, is one of 'language', 'script' or 'country' to select the
    type of code to look up.  Do not pass further parameters (the next
    would deprive you of the cache).

    Raises xpathlite.Error with a suitable message, that includes the
    unknown code's full name if found.

    Relies on global cldr_dir being set before it's called; see tail
    of this file."""
    if not cache:
        cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
    hint = cache[form].get(code)
    message = 'unknown %s code "%s"' % (form, code)
    if hint:
        message += ' - could use "%s"' % hint
    raise xpathlite.Error(message)
+ """ + if not cache: + cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml'))) + name = cache[form].get(code) + msg = 'unknown %s code "%s"' % (form, code) + if name: + msg += ' - could use "%s"' % name + raise xpathlite.Error(msg) + +def parse_list_pattern_part_format(pattern): + # This is a very limited parsing of the format for list pattern part only. + return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3") + +def unit_quantifiers(find, path, stem, suffix, known, + # Stop at exa/exbi: 16 exbi = 2^{64} < zetta = + # 1000^7 < zebi = 2^{70}, the next quantifiers up: + si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')): + """Work out the unit quantifiers. + + Unfortunately, the CLDR data only go up to terabytes and we want + all the way to exabytes; but we can recognize the SI quantifiers + as prefixes, strip and identify the tail as the localized + translation for 'B' (e.g. French has 'octet' for 'byte' and uses + ko, Mo, Go, To from which we can extrapolate Po, Eo). + + Should be called first for the SI quantifiers, with suffix = 'B', + then for the IEC ones, with suffix = 'iB'; the list known + (initially empty before first call) is used to let the second call + know what the first learned about the localized unit. 
+ """ + if suffix == 'B': # first call, known = [] + tail = suffix + for q in si_quantifiers: + it = find(path, stem % q) + # kB for kilobyte, in contrast with KiB for IEC: + q = q[0] if q == 'kilo' else q[0].upper() + if not it: + it = q + tail + elif it.startswith(q): + rest = it[1:] + tail = rest if all(rest == k for k in known) else suffix + known.append(rest) + yield it + else: # second call, re-using first's known + assert suffix == 'iB' + if known: + byte = known.pop() + if all(byte == k for k in known): + suffix = 'i' + byte + for q in si_quantifiers: + yield find(path, stem % q[:2], + # Those don't (yet, v31) exist in CLDR, so we always fall back to: + q[0].upper() + suffix) + +def generateLocaleInfo(path): + if not path.endswith(".xml"): + return {} + + # skip legacy/compatibility ones + alias = findAlias(path) + if alias: + raise xpathlite.Error('alias to "%s"' % alias) + + def code(tag): + return findEntryInFile(path, 'identity/' + tag, attribute="type")[0] + + return _generateLocaleInfo(path, code('language'), code('script'), + code('territory'), code('variant')) + +def getNumberSystems(cache={}): + """Cached look-up of number system information. + + Pass no arguments. 
def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
    """Digest the CLDR data for one locale into a Locale object.

    Returns {} for paths or codes we deliberately skip (root locale,
    missing territory); raises xpathlite.Error - possibly via
    raiseUnknownCode() - for codes enumdata.py does not know and for
    variants, which we do not support."""
    if not path.endswith(".xml"):
        return {}

    if language_code == 'root':
        # just skip it
        return {}

    # we do not support variants
    # ### actually there is only one locale with variant: en_US_POSIX
    # does anybody care about it at all?
    if variant_code:
        raise xpathlite.Error('we do not support variants ("%s")' % variant_code)

    language_id = enumdata.languageCodeToId(language_code)
    if language_id <= 0:
        raiseUnknownCode(language_code, 'language')

    script_id = enumdata.scriptCodeToId(script_code)
    if script_id == -1:
        raiseUnknownCode(script_code, 'script')

    # we should handle fully qualified names with the territory
    if not country_code:
        return {}
    country_id = enumdata.countryCodeToId(country_code)
    if country_id <= 0:
        raiseUnknownCode(country_code, 'country')

    # So we say we accept only those values that have "contributed" or
    # "approved" resolution. see http://www.unicode.org/cldr/process.html
    # But we only respect the resolution for new datas for backward
    # compatibility.
    draft = DraftResolution.contributed

    result = dict(
        language=enumdata.language_list[language_id][0],
        language_code=language_code, language_id=language_id,
        script=enumdata.script_list[script_id][0],
        script_code=script_code, script_id=script_id,
        country=enumdata.country_list[country_id][0],
        country_code=country_code, country_id=country_id,
        variant_code=variant_code)

    (dir_name, file_name) = os.path.split(path)
    def from_supplement(tag,
                        path=os.path.join(dir_name, '..', 'supplemental',
                                          'supplementalData.xml')):
        # Look a tag up in the shared supplemental data, not this locale's file.
        return findTagsInFile(path, tag)
    currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
    result['currencyIsoCode'] = ''
    result['currencyDigits'] = 2
    result['currencyRounding'] = 1
    if currencies:
        for e in currencies:
            if e[0] == 'currency':
                t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
                if t and t[0]:
                    pass
                elif not any(x[0] == 'to' for x in e[1]):
                    # next(genexp), not genexp.next(): the method form is
                    # Python-2-only; the builtin works on 2.6+ and 3.
                    result['currencyIsoCode'] = next(x[1] for x in e[1] if x[0] == 'iso4217')
                    break
    if result['currencyIsoCode']:
        t = from_supplement("currencyData/fractions/info[iso4217=%s]"
                            % result['currencyIsoCode'])
        if t and t[0][0] == 'info':
            result['currencyDigits'] = next(int(x[1]) for x in t[0][1] if x[0] == 'digits')
            result['currencyRounding'] = next(int(x[1]) for x in t[0][1] if x[0] == 'rounding')
    numbering_system = None
    try:
        numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
    except xpathlite.Error:
        pass
    def findEntryDef(path, xpath, value=''):
        # findEntry, but returning value instead of raising on absence.
        try:
            return findEntry(path, xpath)
        except xpathlite.Error:
            return value
    def get_number_in_system(path, xpath, numbering_system):
        if numbering_system:
            try:
                return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
            except xpathlite.Error:
                # in CLDR 1.9 number system was refactored for numbers (but not for currency)
                # so if previous findEntry doesn't work we should try this:
                try:
                    return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
                except xpathlite.Error:
                    # fallback to default
                    pass
        return findEntry(path, xpath)

    result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
    result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
    result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
    result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
    try:
        result['zero'] = getNumberSystems()[numbering_system][u"digits"][0]
    except Exception as e:
        sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
        result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
    result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
    result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
    result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system).lower()
    result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
    result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
    result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
    result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
    result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
    result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
    result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
    result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
    result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
    result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
    result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
    result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
    result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
    result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))

    # Endonym: the locale's own name for itself, most specific form first:
    endonym = None
    if country_code and script_code:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
    if not endonym and script_code:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
    if not endonym and country_code:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
    if not endonym:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
    result['language_endonym'] = endonym
    result['country_endonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))

    currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
    currency_format = parse_number_format(currency_format, result)
    result['currencyFormat'] = currency_format[0]
    result['currencyNegativeFormat'] = ''
    if len(currency_format) > 1:
        result['currencyNegativeFormat'] = currency_format[1]

    result['currencySymbol'] = ''
    result['currencyDisplayName'] = ''
    if result['currencyIsoCode']:
        result['currencySymbol'] = findEntryDef(path, "numbers/currencies/currency[%s]/symbol" % result['currencyIsoCode'])
        result['currencyDisplayName'] = ';'.join(
            findEntryDef(path, 'numbers/currencies/currency[' + result['currencyIsoCode']
                         + ']/displayName' + tail)
            for tail in ['',] + [
                '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
            ]) + ';'

    def findUnitDef(path, stem, fallback=''):
        # The displayName for a quantified unit in en.xml is kByte
        # instead of kB (etc.), so prefer any unitPattern provided:
        for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
            try:
                ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
            except xpathlite.Error:
                continue

            # TODO: exploit count-handling, instead of discarding placeholders
            if ans.startswith('{0}'):
                ans = ans[3:].lstrip()
            if ans:
                return ans

        return findEntryDef(path, stem + 'displayName', fallback)

    # First without quantifier, then quantified each way:
    result['byte_unit'] = findEntryDef(
        path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
        'bytes')
    stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
    known = [] # cases where we *do* have a given version:
    result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
    # IEC 60027-2
    # http://physics.nist.gov/cuu/Units/binary.html
    result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))

    # Used for month and day data:
    namings = (
        ('standaloneLong', 'stand-alone', 'wide'),
        ('standaloneShort', 'stand-alone', 'abbreviated'),
        ('standaloneNarrow', 'stand-alone', 'narrow'),
        ('long', 'format', 'wide'),
        ('short', 'format', 'abbreviated'),
        ('narrow', 'format', 'narrow'),
        )

    # Month names for 12-month calendars:
    for cal in calendars:
        stem = 'dates/calendars/calendar[' + cal + ']/months/'
        for (key, mode, size) in namings:
            prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
            result[key + 'Months_' + cal] = ';'.join(
                findEntry(path, stem + prop + "month[%d]" % i)
                for i in range(1, 13)) + ';'

    # Day data (for Gregorian, at least):
    stem = 'dates/calendars/calendar[gregorian]/days/'
    days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
    for (key, mode, size) in namings:
        prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
        result[key + 'Days'] = ';'.join(
            findEntry(path, stem + prop + '[' + day + ']')
            for day in days) + ';'

    return Locale(result)
def addEscapes(s):
    """Escape non-ASCII content of s as \\xHH hex sequences.

    Accepts a text string or (as happens under Python 3, where encode()
    yields bytes whose iteration gives ints) a bytes object; the original
    called ord() unconditionally, which raises TypeError on an int."""
    result = ''
    for c in s:
        n = c if isinstance(c, int) else ord(c)
        if n < 128:
            result += chr(n) if isinstance(c, int) else c
        else:
            result += "\\x"
            result += "%02x" % (n)
    return result

def unicodeStr(s):
    """Wrap the UTF-8 encoding of s in the <size>/<data> markup qlocalexml uses."""
    utf8 = s.encode('utf-8')
    return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"

def usage():
    """Print the command-line synopsis and exit."""
    # Single-argument print() prints the same text under Python 2 and 3:
    print("Usage: cldr2qlocalexml.py <path-to-cldr-main>")
    sys.exit()

def integrateWeekData(filePath):
    """Fill in each locale's first day of week and weekend bounds.

    Reads weekData from the supplemental data file at filePath and
    updates every entry of the global locale_database in place, falling
    back to the world ("001") defaults for territories with no explicit
    data of their own."""
    if not filePath.endswith(".xml"):
        return {}

    def lookup(key):
        # Space-separated territory list for one weekData element:
        return findEntryInFile(filePath, key, attribute='territories')[0].split()
    days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')

    firstDayByCountryCode = {}
    for day in days:
        for countryCode in lookup('weekData/firstDay[day=%s]' % day):
            firstDayByCountryCode[countryCode] = day

    weekendStartByCountryCode = {}
    for day in days:
        for countryCode in lookup('weekData/weekendStart[day=%s]' % day):
            weekendStartByCountryCode[countryCode] = day

    weekendEndByCountryCode = {}
    for day in days:
        for countryCode in lookup('weekData/weekendEnd[day=%s]' % day):
            weekendEndByCountryCode[countryCode] = day

    # .items() (not Python 2's .iteritems()) iterates fine on both versions:
    for (key, locale) in locale_database.items():
        countryCode = locale.country_code
        if countryCode in firstDayByCountryCode:
            locale.firstDayOfWeek = firstDayByCountryCode[countryCode]
        else:
            locale.firstDayOfWeek = firstDayByCountryCode["001"]

        if countryCode in weekendStartByCountryCode:
            locale.weekendStart = weekendStartByCountryCode[countryCode]
        else:
            locale.weekendStart = weekendStartByCountryCode["001"]

        if countryCode in weekendEndByCountryCode:
            locale.weekendEnd = weekendEndByCountryCode[countryCode]
        else:
            locale.weekendEnd = weekendEndByCountryCode["001"]
def splitLocale(name):
    """Split name into (language, script, territory) triple as generator.

    Ignores any trailing fields (with a warning), leaves script (a capitalised
    four-letter token) or territory (either a number or an all-uppercase token)
    empty if unspecified, returns a single-entry generator if name is a single
    tag (i.e. contains no underscores).  Always yields 1 or 3 values, never 2."""
    tags = iter(name.split('_'))
    yield next(tags)  # Language
    # Guard every next() explicitly: letting StopIteration escape a
    # generator merely ended it in Python 2 but is an error under
    # PEP 479 (Python 3); an explicit return behaves identically.
    try:
        tag = next(tags)
    except StopIteration:
        return  # Single-tag name: just the language.

    # Script is always four letters, always capitalised:
    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
        yield tag
        try:
            tag = next(tags)
        except StopIteration:
            tag = ''
    else:
        yield ''

    # Territory is upper-case or numeric:
    if tag and tag.isupper() or tag.isdigit():
        yield tag
        tag = ''
    else:
        yield ''

    # If nothing is left, stop quietly instead of warning:
    if not tag:
        try:
            tag = next(tags)
        except StopIteration:
            return
    tag = (tag,)
    sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))

if len(sys.argv) != 2:
    usage()

cldr_dir = sys.argv[1]

if not os.path.isdir(cldr_dir):
    usage()

cldr_files = os.listdir(cldr_dir)

# Maps (language_id, script_id, country_id, variant_code) to Locale:
locale_database = {}

# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
defaultContent_locales = []
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
                                      'supplementalMetadata.xml'),
                         'metadata/defaultContent'):
    for data in ns[1:][0]:
        if data[0] == u"locales":
            defaultContent_locales += data[1].split()

skips = []
# Generate locale info for each defaultContent locale (inheriting most
# of its data from its parent), skipping names we cannot parse:
for file in defaultContent_locales:
    try:
        language_code, script_code, country_code = splitLocale(file)
    except ValueError:
        sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
        continue

    if not (script_code or country_code):
        sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
        continue

    try:
        l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
        if not l:
            skips.append(file)
            continue
    except xpathlite.Error as e:
        sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
        continue

    locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l

if skips:
    wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
    skips = []

# Now every ordinary locale file in the main directory:
for file in cldr_files:
    try:
        l = generateLocaleInfo(cldr_dir + "/" + file)
        if not l:
            skips.append(file)
            continue
    except xpathlite.Error as e:
        sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
        continue

    locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l

if skips:
    wrappedwarn('skipping files [no locale info generated]: ', skips)

integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
# sorted() copies, so works for list (Python 2) and view (Python 3) keys alike:
locale_keys = sorted(locale_database.keys())

# Scrape the CLDR version from the DTD; use a with-statement so the file
# is closed when done (the original left it open until interpreter exit):
cldr_version = 'unknown'
with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml:
    for line in ldml:
        if 'version cldrVersion CDATA #FIXED' in line:
            cldr_version = line.split('"')[1]

# Single-argument print() emits identical text under Python 2 and 3:
print("<localeDatabase>")
print(" <version>" + cldr_version + "</version>")
print(" <languageList>")
for id in enumdata.language_list:
    l = enumdata.language_list[id]
    print(" <language>")
    print(" <name>" + l[0] + "</name>")
    print(" <id>" + str(id) + "</id>")
    print(" <code>" + l[1] + "</code>")
    print(" </language>")
print(" </languageList>")
"</name>" + print " <id>" + str(id) + "</id>" + print " <code>" + l[1] + "</code>" + print " </script>" +print " </scriptList>" + +print " <countryList>" +for id in enumdata.country_list: + l = enumdata.country_list[id] + print " <country>" + print " <name>" + l[0] + "</name>" + print " <id>" + str(id) + "</id>" + print " <code>" + l[1] + "</code>" + print " </country>" +print " </countryList>" + +def _parseLocale(l): + language = "AnyLanguage" + script = "AnyScript" + country = "AnyCountry" + + if l == "und": + raise xpathlite.Error("we are treating unknown locale like C") + + parsed = splitLocale(l) + language_code = parsed.next() + script_code = country_code = '' + try: + script_code, country_code = parsed + except ValueError: + pass + + if language_code != "und": + language_id = enumdata.languageCodeToId(language_code) + if language_id == -1: + raise xpathlite.Error('unknown language code "%s"' % language_code) + language = enumdata.language_list[language_id][0] + + if script_code: + script_id = enumdata.scriptCodeToId(script_code) + if script_id == -1: + raise xpathlite.Error('unknown script code "%s"' % script_code) + script = enumdata.script_list[script_id][0] + + if country_code: + country_id = enumdata.countryCodeToId(country_code) + if country_id == -1: + raise xpathlite.Error('unknown country code "%s"' % country_code) + country = enumdata.country_list[country_id][0] + + return (language, script, country) + +skips = [] +print " <likelySubtags>" +for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"): + tmp = {} + for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]] + tmp[data[0]] = data[1] + + try: + from_language, from_script, from_country = _parseLocale(tmp[u"from"]) + to_language, to_script, to_country = _parseLocale(tmp[u"to"]) + except xpathlite.Error as e: + if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']: + 
skips.append(tmp[u'to']) + else: + sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) + continue + # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags + if to_country == "AnyCountry" and from_country != to_country: + to_country = from_country + if to_script == "AnyScript" and from_script != to_script: + to_script = from_script + + print " <likelySubtag>" + print " <from>" + print " <language>" + from_language + "</language>" + print " <script>" + from_script + "</script>" + print " <country>" + from_country + "</country>" + print " </from>" + print " <to>" + print " <language>" + to_language + "</language>" + print " <script>" + to_script + "</script>" + print " <country>" + to_country + "</country>" + print " </to>" + print " </likelySubtag>" +print " </likelySubtags>" +if skips: + wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips) +print " <localeList>" + +Locale.C(calendars).toXml(calendars) +for key in locale_keys: + locale_database[key].toXml(calendars) + +print " </localeList>" +print "</localeDatabase>" |