summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x util/locale_database/cldr2qlocalexml.py 312
-rw-r--r-- util/locale_database/qlocalexml.py 215
2 files changed, 330 insertions, 197 deletions
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index ee53381b22..fba8d7fdd5 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -61,13 +61,13 @@ import enumdata
import xpathlite
from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
from dateconverter import convert_date
-from qlocalexml import Locale
+from qlocalexml import Locale, QLocaleXmlWriter
# TODO: make calendars a command-line option
calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
findEntryInFile = xpathlite._findEntryInFile
-def wrappedwarn(prefix, tokens):
- return sys.stderr.write(
+def wrappedwarn(err, prefix, tokens):
+ return err.write(
'\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
subsequent_indent=' ', width=80)) + '\n')
@@ -101,6 +101,7 @@ def parse_number_format(patterns, data):
result.append(pattern)
return result
+cldr_dir = None
def raiseUnknownCode(code, form, cache={}):
"""Check whether an unknown code could be supported.
@@ -193,8 +194,8 @@ def getNumberSystems(cache={}):
"""Cached look-up of number system information.
Pass no arguments. Returns a mapping from number system names to,
- for each system, a mapping with keys u'digits', u'type' and
- u'id'\n"""
+ for each system, a mapping with keys 'digits', 'type' and 'id'.
+ Relies on global cldr_dir being set before it's first called.\n"""
if not cache:
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
'numberingSystems.xml'),
@@ -419,26 +420,7 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
return Locale(result)
-def addEscapes(s):
- result = ''
- for c in s:
- n = ord(c)
- if n < 128:
- result += c
- else:
- result += "\\x"
- result += "%02x" % (n)
- return result
-
-def unicodeStr(s):
- utf8 = s.encode('utf-8')
- return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"
-
-def usage():
- print "Usage: cldr2qlocalexml.py <path-to-cldr-main>"
- sys.exit()
-
-def integrateWeekData(filePath):
+def integrateWeekData(filePath, locale_database):
if not filePath.endswith(".xml"):
return {}
@@ -510,111 +492,6 @@ def splitLocale(name):
tag = (tag if tag else tags.next(),)
sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
-if len(sys.argv) != 2:
- usage()
-
-cldr_dir = sys.argv[1]
-
-if not os.path.isdir(cldr_dir):
- usage()
-
-cldr_files = os.listdir(cldr_dir)
-
-locale_database = {}
-
-# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = []
-for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'supplementalMetadata.xml'),
- 'metadata/defaultContent'):
- for data in ns[1:][0]:
- if data[0] == u"locales":
- defaultContent_locales += data[1].split()
-
-skips = []
-for file in defaultContent_locales:
- try:
- language_code, script_code, country_code = splitLocale(file)
- except ValueError:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
- continue
-
- if not (script_code or country_code):
- sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
- continue
-
- try:
- l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
- if not l:
- skips.append(file)
- continue
- except xpathlite.Error as e:
- sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
- wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
- skips = []
-
-for file in cldr_files:
- try:
- l = generateLocaleInfo(cldr_dir + "/" + file)
- if not l:
- skips.append(file)
- continue
- except xpathlite.Error as e:
- sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
- wrappedwarn('skipping files [no locale info generated]: ', skips)
-
-integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
-locale_keys = locale_database.keys()
-locale_keys.sort()
-
-cldr_version = 'unknown'
-ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r")
-for line in ldml:
- if 'version cldrVersion CDATA #FIXED' in line:
- cldr_version = line.split('"')[1]
-
-print "<localeDatabase>"
-print " <version>" + cldr_version + "</version>"
-print " <languageList>"
-for id in enumdata.language_list:
- l = enumdata.language_list[id]
- print " <language>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </language>"
-print " </languageList>"
-
-print " <scriptList>"
-for id in enumdata.script_list:
- l = enumdata.script_list[id]
- print " <script>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </script>"
-print " </scriptList>"
-
-print " <countryList>"
-for id in enumdata.country_list:
- l = enumdata.country_list[id]
- print " <country>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </country>"
-print " </countryList>"
-
def _parseLocale(l):
language = "AnyLanguage"
script = "AnyScript"
@@ -651,48 +528,135 @@ def _parseLocale(l):
return (language, script, country)
-skips = []
-print " <likelySubtags>"
-for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
- tmp = {}
- for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
- tmp[data[0]] = data[1]
+def likelySubtags(root, err):
+    """Generator of likely-subtag mappings from CLDR's likelySubtags.xml.
+
+    First argument, root, is the directory containing CLDR's
+    supplemental/ sub-directory; second, err, is a stream (anything
+    with a write() method) to which warnings are written.
+
+    Yields pairs of (language, script, country) triples, as returned
+    by _parseLocale(): the first is the tag to match, the second is
+    what to use for it. Entries whose tags can't be parsed are
+    skipped with a warning; those skipped merely because their
+    language code is unknown are reported together, in a single
+    warning, once all entries have been processed.
+    """
+    skips = []
+    for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"):
+        tmp = {}
+        for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
+            tmp[data[0]] = data[1]
+
+        try:
+            from_language, from_script, from_country = _parseLocale(tmp[u"from"])
+            to_language, to_script, to_country = _parseLocale(tmp[u"to"])
+        except xpathlite.Error as e:
+            if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
+                skips.append(tmp[u'to'])
+            else:
+                # Report via the caller's stream, like the summary below:
+                err.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
+            continue
+        # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+        if to_country == "AnyCountry" and from_country != to_country:
+            to_country = from_country
+        if to_script == "AnyScript" and from_script != to_script:
+            to_script = from_script
+
+        yield ((from_language, from_script, from_country),
+               (to_language, to_script, to_country))
+    if skips:
+        wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips)
+
+def usage(err, name, message = ''):
+    """Write a one-line usage summary to err, then message if given.
+
+    First argument, err, is a stream to write to; second, name, is
+    the program name to show in the summary. Optional third, message,
+    is written after a blank line, followed by a newline, when it is
+    not empty.
+    """
+    synopsis = """Usage: {} <path-to-cldr-main> [out-file.xml]
+""" # TODO: expand
+    err.write(synopsis.format(name))
+    if not message:
+        return
+    err.write('\n' + message + '\n')
+
+def main(args, out, err):
+    """Digest CLDR data and write it out as QLocale XML.
+
+    First argument, args, is the command-line: the program name, the
+    path to CLDR's main/ directory and, optionally, the name of a
+    file to write. Each entry is popped off args as it is handled.
+    Second argument, out, is the stream to write to when no output
+    file is named; third, err, is the stream for usage messages and
+    warnings.
+
+    Sets the global cldr_dir, on which other functions here rely.
+    Returns 0 on success, 1 after reporting a usage error.
+    """
+    name = args.pop(0)
+
+    if len(args) < 1:
+        usage(err, name)
+        return 1
+
+    global cldr_dir
+    cldr_dir = args.pop(0)
+    if not os.path.isdir(cldr_dir):
+        usage(err, name, 'Where did you unpack the CLDR data files ?')
+        return 1
+
+    if len(args) > 1:
+        usage(err, name, 'Too many arguments passed')
+        return 1
+    if args:
+        qxml = open(args.pop(0), 'w')
+    else:
+        qxml = out
- try:
- from_language, from_script, from_country = _parseLocale(tmp[u"from"])
- to_language, to_script, to_country = _parseLocale(tmp[u"to"])
- except xpathlite.Error as e:
- if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
- skips.append(tmp[u'to'])
- else:
- sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
- continue
- # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
- if to_country == "AnyCountry" and from_country != to_country:
- to_country = from_country
- if to_script == "AnyScript" and from_script != to_script:
- to_script = from_script
-
- print " <likelySubtag>"
- print " <from>"
- print " <language>" + from_language + "</language>"
- print " <script>" + from_script + "</script>"
- print " <country>" + from_country + "</country>"
- print " </from>"
- print " <to>"
- print " <language>" + to_language + "</language>"
- print " <script>" + to_script + "</script>"
- print " <country>" + to_country + "</country>"
- print " </to>"
- print " </likelySubtag>"
-print " </likelySubtags>"
-if skips:
- wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
-print " <localeList>"
-
-Locale.C(calendars).toXml(calendars)
-for key in locale_keys:
- locale_database[key].toXml(calendars)
-
-print " </localeList>"
-print "</localeDatabase>"
+    try:
+        # Prime the number-system cache, now that cldr_dir is set. It
+        # takes no arguments: anything passed would be used as its
+        # cache, and a non-empty one makes it skip loading the data.
+        getNumberSystems()
+        cldr_files = os.listdir(cldr_dir)
+        locale_database = {}
+
+        # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
+        defaultContent_locales = []
+        for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
+                                              'supplementalMetadata.xml'),
+                                 'metadata/defaultContent'):
+            for data in ns[1:][0]:
+                if data[0] == u"locales":
+                    defaultContent_locales += data[1].split()
+
+        skips = []
+        for file in defaultContent_locales:
+            try:
+                language_code, script_code, country_code = splitLocale(file)
+            except ValueError:
+                err.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
+                continue
+
+            if not (script_code or country_code):
+                err.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
+                continue
+
+            try:
+                l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
+                if not l:
+                    skips.append(file)
+                    continue
+            except xpathlite.Error as e:
+                err.write('skipping defaultContent locale "{}" ({})\n'.format(file, str(e)))
+                continue
+
+            locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
+
+        if skips:
+            wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ', skips)
+            skips = []
+
+        for file in cldr_files:
+            try:
+                l = generateLocaleInfo(cldr_dir + "/" + file)
+                if not l:
+                    skips.append(file)
+                    continue
+            except xpathlite.Error as e:
+                err.write('skipping file "{}" ({})\n'.format(file, str(e)))
+                continue
+
+            locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
+
+        if skips:
+            wrappedwarn(err, 'skipping files [no locale info generated]: ', skips)
+
+        integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml", locale_database)
+        cldr_version = 'unknown'
+        with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml:
+            for line in ldml:
+                if 'version cldrVersion CDATA #FIXED' in line:
+                    cldr_version = line.split('"')[1]
+
+        xmlOut = QLocaleXmlWriter(qxml.write)
+        xmlOut.version(cldr_version)
+        xmlOut.enumData(enumdata.language_list,
+                        enumdata.script_list,
+                        enumdata.country_list)
+        # Find CLDR's root the same way as every other look-up above;
+        # os.path.split() would return cldr_dir itself, instead of its
+        # parent, if the path was given with a trailing slash.
+        xmlOut.likelySubTags(likelySubtags(os.path.join(cldr_dir, '..'), err))
+        xmlOut.locales(locale_database, calendars)
+        xmlOut.close()
+    finally:
+        # Close the output file, if we opened one, even after a failure:
+        if qxml is not out:
+            qxml.close()
+
+    return 0
+
+# Script entry point: pass the command line and the standard streams to
+# main() and use the value it returns as the process's exit status.
+if __name__ == '__main__':
+ import sys
+ sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index 87e356b8a7..b64ff56c64 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -1,7 +1,7 @@
# coding=utf8
#############################################################################
##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
@@ -28,11 +28,17 @@
#############################################################################
"""Shared serialization-scanning code for QLocaleXML format.
-The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py
+Provides classes:
+ Locale -- common data-type representing one locale as a namespace
+ QLocaleXmlWriter -- helper to write a QLocaleXML file
+
+Support:
+ Spacer -- provides control over indentation of the output.
"""
+from __future__ import print_function
from xml.sax.saxutils import escape
-import xpathlite
+from xpathlite import Error
# Tools used by Locale:
def camel(seq):
@@ -43,10 +49,14 @@ def camel(seq):
def camelCase(words):
return ''.join(camel(iter(words)))
+def addEscapes(s):
+    """Return s with each character outside ASCII replaced by an escape.
+
+    Characters whose ord() is below 128 are kept as they are; any
+    other is replaced by a backslash, an x and its ord() in lower-case
+    hex, zero-padded to at least two digits.
+    """
+    def escaped(c):
+        n = ord(c)
+        return c if n < 128 else '\\x{:02x}'.format(n)
+
+    return ''.join(escaped(c) for c in s)
+
def ordStr(c):
if len(c) == 1:
return str(ord(c))
- raise xpathlite.Error('Unable to handle value "%s"' % addEscapes(c))
+ raise Error('Unable to handle value "{}"'.format(addEscapes(c)))
# Fix for a problem with QLocale returning a character instead of
# strings for QLocale::exponential() and others. So we fallback to
@@ -69,6 +79,8 @@ def convertFormat(format):
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
"""
+ # Compare and contrast dateconverter.py's convert_date().
+ # Need to (check consistency and) reduce redundancy !
result = ""
i = 0
while i < len(format):
@@ -113,7 +125,163 @@ def convertFormat(format):
return result
-class Locale:
+class Spacer (object):
+    """Callable that controls indentation and line-breaks of XML output.
+
+    Calling an instance on a line of text returns that text either
+    unchanged (when minifying) or prefixed with the current
+    indentation and followed by a newline. The current indentation
+    is held in the attribute current.
+    """
+    def __init__(self, indent = None, initial = ''):
+        """Prepare to manage indentation and line breaks.
+
+        Arguments are both optional.
+
+        First argument, indent, is either None (its default, for
+        'minifying'), an integer (number of spaces) or the unit of
+        text that is to be used for each indentation level (e.g. '\t'
+        to use tabs). If indent is None, no indentation is added, nor
+        are line-breaks; otherwise, self(text), for non-empty text,
+        shall end with a newline and begin with indentation.
+
+        Second argument, initial, is the initial indentation; it is
+        ignored if indent is None. Indentation increases after each
+        call to self(text) in which text starts with a tag and doesn't
+        include its end-tag; indentation decreases if text starts with
+        an end-tag. The text is not parsed any more carefully than
+        just described.
+        """
+        # Set in both modes, so that the attribute always exists:
+        self.current = initial
+        if indent is None:
+            self.__each = ''
+            self.__call = lambda x: x
+        else:
+            self.__each = ' ' * indent if isinstance(indent, int) else indent
+            self.__call = self.__wrap
+
+    def __wrap(self, line):
+        """Return line, indented and newline-terminated, updating depth.
+
+        An empty line produces just a newline and leaves the
+        indentation unchanged.
+        """
+        if not line:
+            return '\n'
+
+        indent = self.current
+        if line.startswith('</'):
+            # Slice to an explicit length: indent[:-0] would be empty,
+            # discarding the initial indentation when the unit is empty.
+            keep = max(0, len(indent) - len(self.__each))
+            indent = self.current = indent[:keep]
+        elif line.startswith('<') and not line.startswith('<!'):
+            cut = line.find('>')
+            words = (line[1:] if cut < 0 else line[1 : cut]).strip().split()
+            # A tag with no name (e.g. '<>') can't be matched: ignore it.
+            if words and '</{}>'.format(words[0]) not in line:
+                self.current += self.__each
+        return indent + line + '\n'
+
+    def __call__(self, line):
+        """Return line formatted as selected when self was constructed."""
+        return self.__call(line)
+
+class QLocaleXmlWriter (object):
+    """Helper to write a QLocaleXML file, one section at a time.
+
+    Constructing an instance writes the document's opening tag;
+    close() writes the matching end-tag, after which any further
+    attempt to write raises Error.
+    """
+    def __init__(self, save = None, space = None):
+        """Set up to write digested CLDR data as QLocale XML.
+
+        Arguments are both optional.
+
+        First argument, save, is None (its default) or a callable that
+        will write content to where you intend to save it. If None, it
+        is replaced with a callable that prints the given content,
+        suppressing the newline (but see the following); this is
+        equivalent to passing sys.stdout.write.
+
+        Second argument, space, is an object to call on each text
+        output to prepend indentation and append newlines, or not as
+        the case may be. If it is None (the default), a fresh
+        Spacer(4) is used, which grows indent by four spaces after
+        each unmatched new tag and shrinks back on a close-tag (its
+        parsing is naive, but adequate to how this class uses it),
+        while adding a newline to each line.
+        """
+        self.__rawOutput = self.__printit if save is None else save
+        # Create the default per instance: a Spacer remembers its current
+        # indentation, so one shared by several writers would mix them up.
+        self.__wrap = Spacer(4) if space is None else space
+        self.__write('<localeDatabase>')
+
+    # Output of various sections, in their usual order:
+    def enumData(self, languages, scripts, countries):
+        """Write the language, script and country tables.
+
+        Each argument is a mapping from numeric id to a sequence
+        whose first two entries are a name and a code."""
+        self.__enumTable('languageList', languages)
+        self.__enumTable('scriptList', scripts)
+        self.__enumTable('countryList', countries)
+
+    def likelySubTags(self, entries):
+        """Write the likely sub-tags section.
+
+        Single argument, entries, is an iterable of pairs of
+        (language, script, country) triples, from and to."""
+        self.__openTag('likelySubtags')
+        for have, give in entries:
+            self.__openTag('likelySubtag')
+            self.__likelySubTag('from', have)
+            self.__likelySubTag('to', give)
+            self.__closeTag('likelySubtag')
+        self.__closeTag('likelySubtags')
+
+    def locales(self, locales, calendars):
+        """Write the C locale, then each of locales in order of its key.
+
+        First argument, locales, is a mapping whose values are Locale
+        objects; second, calendars, is the list of calendar names to
+        pass to each Locale's toXml()."""
+        self.__openTag('localeList')
+        self.__openTag('locale')
+        Locale.C(calendars).toXml(self.inTag, calendars)
+        self.__closeTag('locale')
+        for key in sorted(locales):
+            self.__openTag('locale')
+            locales[key].toXml(self.inTag, calendars)
+            self.__closeTag('locale')
+        self.__closeTag('localeList')
+
+    def version(self, cldrVersion):
+        """Write the version element, with cldrVersion as its content."""
+        self.inTag('version', cldrVersion)
+
+    def inTag(self, tag, text):
+        """Write text as the whole content of a tag element.
+
+        The text is written as given: any XML-escaping it needs is
+        the caller's responsibility."""
+        self.__write('<{0}>{1}</{0}>'.format(tag, text))
+
+    def close(self):
+        """Write the document's end-tag; do nothing if already closed."""
+        if self.__rawOutput != self.__complain:
+            self.__write('</localeDatabase>')
+        self.__rawOutput = self.__complain
+
+    # Implementation details
+    @staticmethod
+    def __printit(text):
+        print(text, end='')
+    @staticmethod
+    def __complain(text):
+        raise Error('Attempted to write data after closing :-(')
+
+    def __enumTable(self, tag, table):
+        # The tag of each entry is that of the table without its 'List' suffix.
+        self.__openTag(tag)
+        for key, value in table.items():
+            self.__openTag(tag[:-4])
+            self.inTag('name', value[0])
+            self.inTag('id', key)
+            self.inTag('code', value[1])
+            self.__closeTag(tag[:-4])
+        self.__closeTag(tag)
+
+    def __likelySubTag(self, tag, likely):
+        self.__openTag(tag)
+        self.inTag('language', likely[0])
+        self.inTag('script', likely[1])
+        self.inTag('country', likely[2])
+        # self.inTag('variant', likely[3])
+        self.__closeTag(tag)
+
+    def __openTag(self, tag):
+        self.__write('<{}>'.format(tag))
+    def __closeTag(self, tag):
+        self.__write('</{}>'.format(tag))
+
+    def __write(self, line):
+        self.__rawOutput(self.__wrap(line))
+
+class Locale (object):
+ """Holder for the assorted data representing one locale.
+
+ Implemented as a namespace; its constructor and update() have the
+ same signatures as those of a dict, acting on the instance's
+ __dict__, so the results are accessed as attributes rather than
+ mapping keys."""
+ def __init__(self, data=None, **kw):
+ self.update(data, **kw)
+
+ def update(self, data=None, **kw):
+ if data: self.__dict__.update(data)
+ if kw: self.__dict__.update(kw)
+
+ def __len__(self): # Used when testing as a boolean
+ return len(self.__dict__)
+
@staticmethod
def propsMonthDay(scale, lengths=('long', 'short', 'narrow')):
for L in lengths:
@@ -176,19 +344,26 @@ class Locale:
return cls(data)
- def toXml(self, calendars=('gregorian',), indent=' ', tab=' '):
- print indent + '<locale>'
- inner = indent + tab
+ def toXml(self, write, calendars=('gregorian',)):
+ """Writes its data as QLocale XML.
+
+ First argument, write, is a callable taking the name and
+ content of an XML element; it is expected to be the inTag
+ bound method of a QLocaleXmlWriter instance.
+
+ Optional second argument is a list of calendar names, in the
+ form used by CLDR; its default is ('gregorian',).
+ """
get = lambda k: getattr(self, k)
for key in ('language', 'script', 'country'):
- print inner + "<%s>" % key + get(key) + "</%s>" % key
- print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key
+ write(key, get(key))
+ write('{}code'.format(key), get('{}_code'.format(key)))
for key in ('decimal', 'group', 'zero'):
- print inner + "<%s>" % key + ordStr(get(key)) + "</%s>" % key
+ write(key, ordStr(get(key)))
for key, std in (('list', ';'), ('percent', '%'),
('minus', '-'), ('plus', '+'), ('exp', 'e')):
- print inner + "<%s>" % key + fixOrdStr(get(key), std) + "</%s>" % key
+ write(key, fixOrdStr(get(key), std))
for key in ('languageEndonym', 'countryEndonym',
'quotationStart', 'quotationEnd',
@@ -206,16 +381,10 @@ class Locale:
'_'.join((k, cal))
for k in self.propsMonthDay('months')
for cal in calendars):
- print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key)
+ write(key, escape(get(key)).encode('utf-8'))
for key in ('currencyDigits', 'currencyRounding'):
- print inner + "<%s>%d</%s>" % (key, get(key), key)
-
- print indent + "</locale>"
-
- def __init__(self, data=None, **kw):
- if data: self.__dict__.update(data)
- if kw: self.__dict__.update(kw)
+ write(key, get(key))
# Tools used by __monthNames:
def fullName(i, name): return name
@@ -261,8 +430,8 @@ class Locale:
for cal in calendars:
try:
data = known[cal]
- except KeyError: # Need to add an entry to known, above.
- print 'Unsupported calendar:', cal
+ except KeyError as e: # Need to add an entry to known, above.
+ e.args += ('Unsupported calendar:', cal)
raise
names, get = data[0] + ('',), data[1:]
for n, size in enumerate(sizes):
@@ -279,7 +448,7 @@ class Locale:
'Thursday', 'Friday', 'Saturday', ''),
quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
"""Returns an object representing the C locale."""
- return cls(dict(cls.__monthNames(calendars)),
+ return cls(cls.__monthNames(calendars),
language='C', language_code='0', languageEndonym='',
script='AnyScript', script_code='0',
country='AnyCountry', country_code='0', countryEndonym='',