summary refs log tree commit diff stats
path: root/util/locale_database/cldr2qlocalexml.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/cldr2qlocalexml.py')
-rwxr-xr-x util/locale_database/cldr2qlocalexml.py 705
1 files changed, 64 insertions, 641 deletions
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index 7f98e29d47..c05cabf520 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -31,15 +31,17 @@
The CLDR data can be downloaded from CLDR_, which has a sub-directory
for each version; you need the ``core.zip`` file for your version of
-choice (typically the latest). This script has had updates to cope up
-to v35; for later versions, we may need adaptations. Unpack the
+choice (typically the latest). This script has had updates to cope up
+to v35; for later versions, we may need adaptations. Unpack the
downloaded ``core.zip`` and check it has a common/main/ sub-directory:
-pass the path of that sub-directory to this script as its single
-command-line argument. Save its standard output (but not error) to a
-file for later processing by ``./qlocalexml2cpp.py``
+pass the path of the root of the unpacked download to this script as its first
+command-line argument. Pass the name of the file in which to write
+output as the second argument; either omit it or use '-' to select the
+standard output. This file is the input needed by
+``./qlocalexml2cpp.py``
When you update the CLDR data, be sure to also update
-src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
+src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
this script's output for unknown language, country or script messages;
if any can be resolved, use their entry in common/main/en.xml to
append new entries to enumdata.py's lists and update documentation in
@@ -54,646 +56,67 @@ time zone names; see cldr2qtimezone.py for details.
import os
import sys
-import re
-import textwrap
-import enumdata
-import xpathlite
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
-from dateconverter import convert_date
-from qlocalexml import Locale
-
-# TODO: make calendars a command-line option
-calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
-findEntryInFile = xpathlite._findEntryInFile
-def wrappedwarn(prefix, tokens):
- return sys.stderr.write(
- '\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
- subsequent_indent=' ', width=80)) + '\n')
-
-def parse_number_format(patterns, data):
- # this is a very limited parsing of the number format for currency only.
- def skip_repeating_pattern(x):
- p = x.replace('0', '#').replace(',', '').replace('.', '')
- seen = False
- result = ''
- for c in p:
- if c == '#':
- if seen:
- continue
- seen = True
- else:
- seen = False
- result = result + c
- return result
- patterns = patterns.split(';')
- result = []
- for pattern in patterns:
- pattern = skip_repeating_pattern(pattern)
- pattern = pattern.replace('#', "%1")
- # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
- # there can be doubled or trippled currency sign, however none of the
- # locales use that.
- pattern = pattern.replace(u'\xa4', "%2")
- pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'")
- pattern = pattern.replace('-', data['minus'])
- pattern = pattern.replace('+', data['plus'])
- result.append(pattern)
- return result
-
-def raiseUnknownCode(code, form, cache={}):
- """Check whether an unknown code could be supported.
-
- We declare a language, script or country code unknown if it's not
- known to enumdata.py; however, if it's present in main/en.xml's
- mapping of codes to names, we have the option of adding support.
- This caches the necessary look-up (so we only read main/en.xml
- once) and returns the name we should use if we do add support.
-
- First parameter, code, is the unknown code. Second parameter,
- form, is one of 'language', 'script' or 'country' to select the
- type of code to look up. Do not pass further parameters (the next
- will deprive you of the cache).
-
- Raises xpathlite.Error with a suitable message, that includes the
- unknown code's full name if found.
-
- Relies on global cldr_dir being set before it's called; see tail
- of this file.
- """
- if not cache:
- cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
- name = cache[form].get(code)
- msg = 'unknown %s code "%s"' % (form, code)
- if name:
- msg += ' - could use "%s"' % name
- raise xpathlite.Error(msg)
-
-def parse_list_pattern_part_format(pattern):
- # This is a very limited parsing of the format for list pattern part only.
- return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
-
-def unit_quantifiers(find, path, stem, suffix, known,
- # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
- # 1000^7 < zebi = 2^{70}, the next quantifiers up:
- si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
- """Work out the unit quantifiers.
-
- Unfortunately, the CLDR data only go up to terabytes and we want
- all the way to exabytes; but we can recognize the SI quantifiers
- as prefixes, strip and identify the tail as the localized
- translation for 'B' (e.g. French has 'octet' for 'byte' and uses
- ko, Mo, Go, To from which we can extrapolate Po, Eo).
-
- Should be called first for the SI quantifiers, with suffix = 'B',
- then for the IEC ones, with suffix = 'iB'; the list known
- (initially empty before first call) is used to let the second call
- know what the first learned about the localized unit.
- """
- if suffix == 'B': # first call, known = []
- tail = suffix
- for q in si_quantifiers:
- it = find(path, stem % q)
- # kB for kilobyte, in contrast with KiB for IEC:
- q = q[0] if q == 'kilo' else q[0].upper()
- if not it:
- it = q + tail
- elif it.startswith(q):
- rest = it[1:]
- tail = rest if all(rest == k for k in known) else suffix
- known.append(rest)
- yield it
- else: # second call, re-using first's known
- assert suffix == 'iB'
- if known:
- byte = known.pop()
- if all(byte == k for k in known):
- suffix = 'i' + byte
- for q in si_quantifiers:
- yield find(path, stem % q[:2],
- # Those don't (yet, v31) exist in CLDR, so we always fall back to:
- q[0].upper() + suffix)
-
-def generateLocaleInfo(path):
- if not path.endswith(".xml"):
- return {}
-
- # skip legacy/compatibility ones
- alias = findAlias(path)
- if alias:
- raise xpathlite.Error('alias to "%s"' % alias)
-
- def code(tag):
- return findEntryInFile(path, 'identity/' + tag, attribute="type")[0]
-
- return _generateLocaleInfo(path, code('language'), code('script'),
- code('territory'), code('variant'))
-
-def getNumberSystems(cache={}):
- """Cached look-up of number system information.
-
- Pass no arguments. Returns a mapping from number system names to,
- for each system, a mapping with keys u'digits', u'type' and
- u'id'\n"""
- if not cache:
- for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'numberingSystems.xml'),
- 'numberingSystems'):
- # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]]
- entry = dict(ns[1])
- cache[entry[u'id']] = entry
- return cache
-
-def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
- if not path.endswith(".xml"):
- return {}
-
- if language_code == 'root':
- # just skip it
- return {}
-
- # we do not support variants
- # ### actually there is only one locale with variant: en_US_POSIX
- # does anybody care about it at all?
- if variant_code:
- raise xpathlite.Error('we do not support variants ("%s")' % variant_code)
-
- language_id = enumdata.languageCodeToId(language_code)
- if language_id <= 0:
- raiseUnknownCode(language_code, 'language')
-
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raiseUnknownCode(script_code, 'script')
-
- # we should handle fully qualified names with the territory
- if not country_code:
- return {}
- country_id = enumdata.countryCodeToId(country_code)
- if country_id <= 0:
- raiseUnknownCode(country_code, 'country')
-
- # So we say we accept only those values that have "contributed" or
- # "approved" resolution. see http://www.unicode.org/cldr/process.html
- # But we only respect the resolution for new datas for backward
- # compatibility.
- draft = DraftResolution.contributed
-
- result = dict(
- language=enumdata.language_list[language_id][0],
- language_code=language_code, language_id=language_id,
- script=enumdata.script_list[script_id][0],
- script_code=script_code, script_id=script_id,
- country=enumdata.country_list[country_id][0],
- country_code=country_code, country_id=country_id,
- variant_code=variant_code)
-
- (dir_name, file_name) = os.path.split(path)
- def from_supplement(tag,
- path=os.path.join(dir_name, '..', 'supplemental',
- 'supplementalData.xml')):
- return findTagsInFile(path, tag)
- currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
- result['currencyIsoCode'] = ''
- result['currencyDigits'] = 2
- result['currencyRounding'] = 1
- if currencies:
- for e in currencies:
- if e[0] == 'currency':
- t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
- if t and t[0]:
- pass
- elif not any(x[0] == 'to' for x in e[1]):
- result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next()
- break
- if result['currencyIsoCode']:
- t = from_supplement("currencyData/fractions/info[iso4217=%s]"
- % result['currencyIsoCode'])
- if t and t[0][0] == 'info':
- result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next()
- result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next()
- numbering_system = None
- try:
- numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
- except xpathlite.Error:
- pass
- def findEntryDef(path, xpath, value=''):
- try:
- return findEntry(path, xpath)
- except xpathlite.Error:
- return value
- def get_number_in_system(path, xpath, numbering_system):
- if numbering_system:
- try:
- return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
- except xpathlite.Error:
- # in CLDR 1.9 number system was refactored for numbers (but not for currency)
- # so if previous findEntry doesn't work we should try this:
- try:
- return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
- except xpathlite.Error:
- # fallback to default
- pass
- return findEntry(path, xpath)
-
- result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
- result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
- assert result['decimal'] != result['group']
- result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
- result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
- try:
- digits = getNumberSystems()[numbering_system][u"digits"];
- assert len(digits) == 10 and all(ord(d) - i == ord(digits[0]) for i, d in enumerate(digits))
- result['zero'] = digits[0]
- except Exception as e:
- sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
- result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
- result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
- result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
- result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system)
- result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
- result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
- result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
- result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
- result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
- result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
- result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
- result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
- result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
- result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
- result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
- result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
- result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
- result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))
-
- endonym = None
- if country_code and script_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
- if not endonym and script_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
- if not endonym and country_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
- if not endonym:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
- result['languageEndonym'] = endonym
- result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))
-
- currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
- currency_format = parse_number_format(currency_format, result)
- result['currencyFormat'] = currency_format[0]
- result['currencyNegativeFormat'] = ''
- if len(currency_format) > 1:
- result['currencyNegativeFormat'] = currency_format[1]
-
- result['currencySymbol'] = ''
- result['currencyDisplayName'] = ''
- if result['currencyIsoCode']:
- stem = "numbers/currencies/currency[%s]/" % result['currencyIsoCode']
- result['currencySymbol'] = findEntryDef(path, stem + 'symbol')
- displays = tuple(findEntryDef(path, stem + 'displayName' + tail)
- for tail in ('',) + tuple(
- '[count=%s]' % x for x in ('zero', 'one', 'two',
- 'few', 'many', 'other')))
- while displays and not displays[-1]:
- displays = displays[:-1]
- result['currencyDisplayName'] = ';'.join(displays)
-
- def findUnitDef(path, stem, fallback=''):
- # The displayName for a quantified unit in en.xml is kByte
- # instead of kB (etc.), so prefer any unitPattern provided:
- for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
- try:
- ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
- except xpathlite.Error:
- continue
-
- # TODO: epxloit count-handling, instead of discarding placeholders
- if ans.startswith('{0}'):
- ans = ans[3:].lstrip()
- if ans:
- return ans
-
- return findEntryDef(path, stem + 'displayName', fallback)
-
- # First without quantifier, then quantified each way:
- result['byte_unit'] = findEntryDef(
- path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
- 'bytes')
- stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
- known = [] # cases where we *do* have a given version:
- result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
- # IEC 60027-2
- # http://physics.nist.gov/cuu/Units/binary.html
- result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
-
- # Used for month and day data:
- namings = (
- ('standaloneLong', 'stand-alone', 'wide'),
- ('standaloneShort', 'stand-alone', 'abbreviated'),
- ('standaloneNarrow', 'stand-alone', 'narrow'),
- ('long', 'format', 'wide'),
- ('short', 'format', 'abbreviated'),
- ('narrow', 'format', 'narrow'),
- )
-
- # Month names for 12-month calendars:
- for cal in calendars:
- stem = 'dates/calendars/calendar[' + cal + ']/months/'
- for (key, mode, size) in namings:
- prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
- result[key + 'Months_' + cal] = ';'.join(
- findEntry(path, stem + prop + "month[%d]" % i)
- for i in range(1, 13))
-
- # Day data (for Gregorian, at least):
- stem = 'dates/calendars/calendar[gregorian]/days/'
- days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
- for (key, mode, size) in namings:
- prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
- result[key + 'Days'] = ';'.join(
- findEntry(path, stem + prop + '[' + day + ']')
- for day in days)
-
- return Locale(result)
-
-def addEscapes(s):
- result = ''
- for c in s:
- n = ord(c)
- if n < 128:
- result += c
- else:
- result += "\\x"
- result += "%02x" % (n)
- return result
-
-def unicodeStr(s):
- utf8 = s.encode('utf-8')
- return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"
-
-def usage():
- print "Usage: cldr2qlocalexml.py <path-to-cldr-main>"
- sys.exit()
-
-def integrateWeekData(filePath):
- if not filePath.endswith(".xml"):
- return {}
-
- def lookup(key):
- return findEntryInFile(filePath, key, attribute='territories')[0].split()
- days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
-
- firstDayByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/firstDay[day=%s]' % day):
- firstDayByCountryCode[countryCode] = day
-
- weekendStartByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/weekendStart[day=%s]' % day):
- weekendStartByCountryCode[countryCode] = day
-
- weekendEndByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/weekendEnd[day=%s]' % day):
- weekendEndByCountryCode[countryCode] = day
-
- for (key, locale) in locale_database.iteritems():
- countryCode = locale.country_code
- if countryCode in firstDayByCountryCode:
- locale.firstDayOfWeek = firstDayByCountryCode[countryCode]
- else:
- locale.firstDayOfWeek = firstDayByCountryCode["001"]
-
- if countryCode in weekendStartByCountryCode:
- locale.weekendStart = weekendStartByCountryCode[countryCode]
- else:
- locale.weekendStart = weekendStartByCountryCode["001"]
-
- if countryCode in weekendEndByCountryCode:
- locale.weekendEnd = weekendEndByCountryCode[countryCode]
- else:
- locale.weekendEnd = weekendEndByCountryCode["001"]
-
-def splitLocale(name):
- """Split name into (language, script, territory) triple as generator.
-
- Ignores any trailing fields (with a warning), leaves script (a capitalised
- four-letter token) or territory (either a number or an all-uppercase token)
- empty if unspecified, returns a single-entry generator if name is a single
- tag (i.e. contains no underscores). Always yields 1 or 3 values, never 2."""
- tags = iter(name.split('_'))
- yield tags.next() # Language
- tag = tags.next()
-
- # Script is always four letters, always capitalised:
- if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
- yield tag
- try:
- tag = tags.next()
- except StopIteration:
- tag = ''
- else:
- yield ''
-
- # Territory is upper-case or numeric:
- if tag and tag.isupper() or tag.isdigit():
- yield tag
- tag = ''
+from localetools import Error
+from cldr import CldrReader
+from qlocalexml import QLocaleXmlWriter
+from enumdata import language_list, script_list, country_list
+
+def usage(name, err, message = ''):
+ err.write("""Usage: {} path/to/cldr/common/main [out-file.xml]
+""".format(name)) # TODO: expand command-line, improve help message
+ if message:
+ err.write('\n' + message + '\n')
+
+def main(args, out, err):
+ # TODO: make calendars a command-line option
+ calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
+
+ # TODO: make argument parsing more sophisticated
+ name = args.pop(0)
+ if not args:
+ usage(name, err, 'Where is your CLDR data tree ?')
+ return 1
+
+ root = args.pop(0)
+ if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')):
+ usage(name, err,
+ 'First argument is the root of the CLDR tree: found no common/main/root.xml under '
+ + root)
+ return 1
+
+ xml = args.pop(0) if args else None
+ if not xml or xml == '-':
+ emit = out
+ elif not xml.endswith('.xml'):
+ usage(name, err, 'Please use a .xml extension on your output file name, not ' + xml)
+ return 1
else:
- yield ''
-
- # If nothing is left, StopIteration will avoid the warning:
- tag = (tag if tag else tags.next(),)
- sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
-
-if len(sys.argv) != 2:
- usage()
-
-cldr_dir = sys.argv[1]
-
-if not os.path.isdir(cldr_dir):
- usage()
-
-cldr_files = os.listdir(cldr_dir)
-
-locale_database = {}
-
-# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = []
-for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'supplementalMetadata.xml'),
- 'metadata/defaultContent'):
- for data in ns[1:][0]:
- if data[0] == u"locales":
- defaultContent_locales += data[1].split()
-
-skips = []
-for file in defaultContent_locales:
- try:
- language_code, script_code, country_code = splitLocale(file)
- except ValueError:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
- continue
-
- if not (script_code or country_code):
- sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
- continue
-
- try:
- l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
- if not l:
- skips.append(file)
- continue
- except xpathlite.Error as e:
- sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
- wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
- skips = []
-
-for file in cldr_files:
- try:
- l = generateLocaleInfo(cldr_dir + "/" + file)
- if not l:
- skips.append(file)
- continue
- except xpathlite.Error as e:
- sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
- wrappedwarn('skipping files [no locale info generated]: ', skips)
-
-integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
-locale_keys = locale_database.keys()
-locale_keys.sort()
-
-cldr_version = 'unknown'
-ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r")
-for line in ldml:
- if 'version cldrVersion CDATA #FIXED' in line:
- cldr_version = line.split('"')[1]
-
-if sys.stdout.encoding != 'UTF-8' or (sys.stdout.encoding is None and sys.getdefaultencoding() != 'UTF-8'):
- reload(sys) # Weirdly, this gets a richer sys module than the plain import got us !
- sys.setdefaultencoding('UTF-8')
-
-print "<localeDatabase>"
-print " <version>" + cldr_version + "</version>"
-print " <languageList>"
-for id in enumdata.language_list:
- l = enumdata.language_list[id]
- print " <language>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </language>"
-print " </languageList>"
-
-print " <scriptList>"
-for id in enumdata.script_list:
- l = enumdata.script_list[id]
- print " <script>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </script>"
-print " </scriptList>"
-
-print " <countryList>"
-for id in enumdata.country_list:
- l = enumdata.country_list[id]
- print " <country>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </country>"
-print " </countryList>"
-
-def _parseLocale(l):
- language = "AnyLanguage"
- script = "AnyScript"
- country = "AnyCountry"
-
- if l == "und":
- raise xpathlite.Error("we are treating unknown locale like C")
-
- parsed = splitLocale(l)
- language_code = parsed.next()
- script_code = country_code = ''
- try:
- script_code, country_code = parsed
- except ValueError:
- pass
-
- if language_code != "und":
- language_id = enumdata.languageCodeToId(language_code)
- if language_id == -1:
- raise xpathlite.Error('unknown language code "%s"' % language_code)
- language = enumdata.language_list[language_id][0]
-
- if script_code:
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raise xpathlite.Error('unknown script code "%s"' % script_code)
- script = enumdata.script_list[script_id][0]
-
- if country_code:
- country_id = enumdata.countryCodeToId(country_code)
- if country_id == -1:
- raise xpathlite.Error('unknown country code "%s"' % country_code)
- country = enumdata.country_list[country_id][0]
+ try:
+ emit = open(xml, 'w')
+ except IOError as e:
+ usage(name, err, 'Failed to open "{}" to write output to it\n'.format(xml))
+ return 1
- return (language, script, country)
+ if args:
+ usage(name, err, 'Too many arguments - excess: ' + ' '.join(args))
+ return 1
-skips = []
-print " <likelySubtags>"
-for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
- tmp = {}
- for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
- tmp[data[0]] = data[1]
+ if emit.encoding != 'UTF-8' or (emit.encoding is None and sys.getdefaultencoding() != 'UTF-8'):
+ reload(sys) # Weirdly, this gets a richer sys module than the plain import got us !
+ sys.setdefaultencoding('UTF-8')
- try:
- from_language, from_script, from_country = _parseLocale(tmp[u"from"])
- to_language, to_script, to_country = _parseLocale(tmp[u"to"])
- except xpathlite.Error as e:
- if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
- skips.append(tmp[u'to'])
- else:
- sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
- continue
- # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
- if to_country == "AnyCountry" and from_country != to_country:
- to_country = from_country
- if to_script == "AnyScript" and from_script != to_script:
- to_script = from_script
+ # TODO - command line options to tune choice of grumble and whitter:
+ reader = CldrReader(root, err.write, err.write)
+ writer = QLocaleXmlWriter(emit.write)
- print " <likelySubtag>"
- print " <from>"
- print " <language>" + from_language + "</language>"
- print " <script>" + from_script + "</script>"
- print " <country>" + from_country + "</country>"
- print " </from>"
- print " <to>"
- print " <language>" + to_language + "</language>"
- print " <script>" + to_script + "</script>"
- print " <country>" + to_country + "</country>"
- print " </to>"
- print " </likelySubtag>"
-print " </likelySubtags>"
-if skips:
- wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
-print " <localeList>"
+ writer.version(reader.root.cldrVersion)
+ writer.enumData(language_list, script_list, country_list)
+ writer.likelySubTags(reader.likelySubTags())
+ writer.locales(reader.readLocales(calendars), calendars)
-Locale.C(calendars).toXml(calendars)
-for key in locale_keys:
- locale_database[key].toXml(calendars)
+ writer.close()
+ return 0
-print " </localeList>"
-print "</localeDatabase>"
+if __name__ == '__main__':
+ sys.exit(main(sys.argv, sys.stdout, sys.stderr))