From a20697a3940ede60b2fd5eac0ffd1a57b132191a Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Wed, 19 Feb 2020 15:17:16 +0100 Subject: Rework cldr2qlocalexml.py in terms of a QLocaleXmlWriter class Delegate the output of XML to a helper class provided by qlocalexml.py and restructure the driver script so that it can be imported without running anything. It now has a minimal __name__ == '__main__' block that calls a main() function. This, for the moment, requires a global via which it shares the CLDR directory with various other functions; that shall go away in a later commit. Task-number: QTBUG-81344 Change-Id: Ica2d3ec09f2d38ba42fd930258cc765283f29a71 Reviewed-by: Cristian Maureira-Fredes --- util/locale_database/cldr2qlocalexml.py | 312 ++++++++++++++------------------ util/locale_database/qlocalexml.py | 215 +++++++++++++++++++--- 2 files changed, 330 insertions(+), 197 deletions(-) diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py index ee53381b22..fba8d7fdd5 100755 --- a/util/locale_database/cldr2qlocalexml.py +++ b/util/locale_database/cldr2qlocalexml.py @@ -61,13 +61,13 @@ import enumdata import xpathlite from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile from dateconverter import convert_date -from qlocalexml import Locale +from qlocalexml import Locale, QLocaleXmlWriter # TODO: make calendars a command-line option calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew' findEntryInFile = xpathlite._findEntryInFile -def wrappedwarn(prefix, tokens): - return sys.stderr.write( +def wrappedwarn(err, prefix, tokens): + return err.write( '\n'.join(textwrap.wrap(prefix + ', '.join(tokens), subsequent_indent=' ', width=80)) + '\n') @@ -101,6 +101,7 @@ def parse_number_format(patterns, data): result.append(pattern) return result +cldr_dir = None def raiseUnknownCode(code, form, cache={}): """Check whether an unknown code could be supported. 
@@ -193,8 +194,8 @@ def getNumberSystems(cache={}): """Cached look-up of number system information. Pass no arguments. Returns a mapping from number system names to, - for each system, a mapping with keys u'digits', u'type' and - u'id'\n""" + for each system, a mapping with keys 'digits', 'type' and 'id'. + Relies on global cldr_dir being set before it's first called.\n""" if not cache: for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', 'numberingSystems.xml'), @@ -419,26 +420,7 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_ return Locale(result) -def addEscapes(s): - result = '' - for c in s: - n = ord(c) - if n < 128: - result += c - else: - result += "\\x" - result += "%02x" % (n) - return result - -def unicodeStr(s): - utf8 = s.encode('utf-8') - return "" + str(len(utf8)) + "" + addEscapes(utf8) + "" - -def usage(): - print "Usage: cldr2qlocalexml.py " - sys.exit() - -def integrateWeekData(filePath): +def integrateWeekData(filePath, locale_database): if not filePath.endswith(".xml"): return {} @@ -510,111 +492,6 @@ def splitLocale(name): tag = (tag if tag else tags.next(),) sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name)) -if len(sys.argv) != 2: - usage() - -cldr_dir = sys.argv[1] - -if not os.path.isdir(cldr_dir): - usage() - -cldr_files = os.listdir(cldr_dir) - -locale_database = {} - -# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content -defaultContent_locales = [] -for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', - 'supplementalMetadata.xml'), - 'metadata/defaultContent'): - for data in ns[1:][0]: - if data[0] == u"locales": - defaultContent_locales += data[1].split() - -skips = [] -for file in defaultContent_locales: - try: - language_code, script_code, country_code = splitLocale(file) - except ValueError: - sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n') - continue - - 
if not (script_code or country_code): - sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n') - continue - - try: - l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code) - if not l: - skips.append(file) - continue - except xpathlite.Error as e: - sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e))) - continue - - locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l - -if skips: - wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips) - skips = [] - -for file in cldr_files: - try: - l = generateLocaleInfo(cldr_dir + "/" + file) - if not l: - skips.append(file) - continue - except xpathlite.Error as e: - sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e))) - continue - - locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l - -if skips: - wrappedwarn('skipping files [no locale info generated]: ', skips) - -integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml") -locale_keys = locale_database.keys() -locale_keys.sort() - -cldr_version = 'unknown' -ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r") -for line in ldml: - if 'version cldrVersion CDATA #FIXED' in line: - cldr_version = line.split('"')[1] - -print "" -print " " + cldr_version + "" -print " " -for id in enumdata.language_list: - l = enumdata.language_list[id] - print " " - print " " + l[0] + "" - print " " + str(id) + "" - print " " + l[1] + "" - print " " -print " " - -print " " -for id in enumdata.script_list: - l = enumdata.script_list[id] - print " " -print " " - -print " " -for id in enumdata.country_list: - l = enumdata.country_list[id] - print " " - print " " + l[0] + "" - print " " + str(id) + "" - print " " + l[1] + "" - print " " -print " " - def _parseLocale(l): language = "AnyLanguage" script = "AnyScript" @@ -651,48 +528,135 @@ def _parseLocale(l): return (language, 
script, country) -skips = [] -print " " -for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"): - tmp = {} - for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]] - tmp[data[0]] = data[1] +def likelySubtags(root, err): + skips = [] + for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"): + tmp = {} + for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]] + tmp[data[0]] = data[1] + + try: + from_language, from_script, from_country = _parseLocale(tmp[u"from"]) + to_language, to_script, to_country = _parseLocale(tmp[u"to"]) + except xpathlite.Error as e: + if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']: + skips.append(tmp[u'to']) + else: + sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) + continue + # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags + if to_country == "AnyCountry" and from_country != to_country: + to_country = from_country + if to_script == "AnyScript" and from_script != to_script: + to_script = from_script + + yield ((from_language, from_script, from_country), + (to_language, to_script, to_country)) + if skips: + wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips) + +def usage(err, name, message = ''): + err.write("""Usage: {} [out-file.xml] +""".format(name)) # TODO: expand + if message: + err.write('\n' + message + '\n') + +def main(args, out, err): + name = args.pop(0) + + if len(args) < 1: + usage(err, name) + return 1 + + global cldr_dir + cldr_dir = args.pop(0) + if not os.path.isdir(cldr_dir): + usage(err, name, 'Where did you unpack the CLDR data files ?') + return 1 + + if len(args) > 1: + usage(err, name, 'Too many arguments passed') + return 1 + if args: + qxml = open(args.pop(0), 'w') + else: + qxml = 
out - try: - from_language, from_script, from_country = _parseLocale(tmp[u"from"]) - to_language, to_script, to_country = _parseLocale(tmp[u"to"]) - except xpathlite.Error as e: - if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']: - skips.append(tmp[u'to']) - else: - sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) - continue - # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags - if to_country == "AnyCountry" and from_country != to_country: - to_country = from_country - if to_script == "AnyScript" and from_script != to_script: - to_script = from_script - - print " " - print " " - print " " + from_language + "" - print " " - print " " + from_country + "" - print " " - print " " - print " " + to_language + "" - print " " - print " " + to_country + "" - print " " - print " " -print " " -if skips: - wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips) -print " " - -Locale.C(calendars).toXml(calendars) -for key in locale_keys: - locale_database[key].toXml(calendars) - -print " " -print "" + getNumberSystems(cldr_dir) + cldr_files = os.listdir(cldr_dir) + locale_database = {} + + # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content + defaultContent_locales = [] + for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', + 'supplementalMetadata.xml'), + 'metadata/defaultContent'): + for data in ns[1:][0]: + if data[0] == u"locales": + defaultContent_locales += data[1].split() + + skips = [] + for file in defaultContent_locales: + try: + language_code, script_code, country_code = splitLocale(file) + except ValueError: + sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n') + continue + + if not (script_code or country_code): + sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n') + continue + + 
try: + l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code) + if not l: + skips.append(file) + continue + except xpathlite.Error as e: + sys.stderr.write('skipping defaultContent locale "{}" ({})\n'.format(file, str(e))) + continue + + locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l + + if skips: + wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ', skips) + skips = [] + + for file in cldr_files: + try: + l = generateLocaleInfo(cldr_dir + "/" + file) + if not l: + skips.append(file) + continue + except xpathlite.Error as e: + sys.stderr.write('skipping file "{}" ({})\n'.format(file, str(e))) + continue + + locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l + + if skips: + wrappedwarn(err, 'skipping files [no locale info generated]: ', skips) + + integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml", locale_database) + cldr_version = 'unknown' + with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml: + for line in ldml: + if 'version cldrVersion CDATA #FIXED' in line: + cldr_version = line.split('"')[1] + + xmlOut = QLocaleXmlWriter(qxml.write) + xmlOut.version(cldr_version) + xmlOut.enumData(enumdata.language_list, + enumdata.script_list, + enumdata.country_list) + xmlOut.likelySubTags(likelySubtags(os.path.split(cldr_dir)[0], err)) + xmlOut.locales(locale_database, calendars) + xmlOut.close() + if qxml is not out: + qxml.close() + + return 0 + +if __name__ == '__main__': + import sys + sys.exit(main(sys.argv, sys.stdout, sys.stderr)) diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py index 87e356b8a7..b64ff56c64 100644 --- a/util/locale_database/qlocalexml.py +++ b/util/locale_database/qlocalexml.py @@ -1,7 +1,7 @@ # coding=utf8 ############################################################################# ## -## Copyright (C) 2018 The Qt Company Ltd. 
+## Copyright (C) 2020 The Qt Company Ltd. ## Contact: https://www.qt.io/licensing/ ## ## This file is part of the test suite of the Qt Toolkit. @@ -28,11 +28,17 @@ ############################################################################# """Shared serialization-scanning code for QLocaleXML format. -The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py +Provides classes: + Locale -- common data-type representing one locale as a namespace + QLocaleXmlWriter -- helper to write a QLocaleXML file + +Support: + Spacer -- provides control over indentation of the output. """ +from __future__ import print_function from xml.sax.saxutils import escape -import xpathlite +from xpathlite import Error # Tools used by Locale: def camel(seq): @@ -43,10 +49,14 @@ def camel(seq): def camelCase(words): return ''.join(camel(iter(words))) +def addEscapes(s): + return ''.join(c if n < 128 else '\\x{:02x}'.format(n) + for n, c in ((ord(c), c) for c in s)) + def ordStr(c): if len(c) == 1: return str(ord(c)) - raise xpathlite.Error('Unable to handle value "%s"' % addEscapes(c)) + raise Error('Unable to handle value "{}"'.format(addEscapes(c))) # Fix for a problem with QLocale returning a character instead of # strings for QLocale::exponential() and others. So we fallback to @@ -69,6 +79,8 @@ def convertFormat(format): * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString() """ + # Compare and contrast dateconverter.py's convert_date(). + # Need to (check consistency and) reduce redundancy ! result = "" i = 0 while i < len(format): @@ -113,7 +125,163 @@ def convertFormat(format): return result -class Locale: +class Spacer (object): + def __init__(self, indent = None, initial = ''): + """Prepare to manage indentation and line breaks. + + Arguments are both optional. 
+ + First argument, indent, is either None (its default, for + 'minifying'), an integer (number of spaces) or the unit of + text that is to be used for each indentation level (e.g. '\t' + to use tabs). If indent is None, no indentation is added, nor + are line-breaks; otherwise, self(text), for non-empty text, + shall end with a newline and begin with indentation. + + Second argument, initial, is the initial indentation; it is + ignored if indent is None. Indentation increases after each + call to self(text) in which text starts with a tag and doesn't + include its end-tag; indentation decreases if text starts with + an end-tag. The text is not parsed any more carefully than + just described. + """ + if indent is None: + self.__call = lambda x: x + else: + self.__each = ' ' * indent if isinstance(indent, int) else indent + self.current = initial + self.__call = self.__wrap + + def __wrap(self, line): + if not line: + return '\n' + + indent = self.current + if line.startswith('') + tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0] + if ''.format(tag) not in line: + self.current += self.__each + return indent + line + '\n' + + def __call__(self, line): + return self.__call(line) + +class QLocaleXmlWriter (object): + def __init__(self, save = None, space = Spacer(4)): + """Set up to write digested CLDR data as QLocale XML. + + Arguments are both optional. + + First argument, save, is None (its default) or a callable that + will write content to where you intend to save it. If None, it + is replaced with a callable that prints the given content, + suppressing the newline (but see the following); this is + equivalent to passing sys.stdout.write. + + Second argument, space, is an object to call on each text + output to prepend indentation and append newlines, or not as + the case may be. 
The default is a Spacer(4), which grows + indent by four spaces after each unmatched new tag and shrinks + back on a close-tag (its parsing is naive, but adequate to how + this class uses it), while adding a newline to each line. + """ + self.__rawOutput = self.__printit if save is None else save + self.__wrap = space + self.__write('') + + # Output of various sections, in their usual order: + def enumData(self, languages, scripts, countries): + self.__enumTable('languageList', languages) + self.__enumTable('scriptList', scripts) + self.__enumTable('countryList', countries) + + def likelySubTags(self, entries): + self.__openTag('likelySubtags') + for have, give in entries: + self.__openTag('likelySubtag') + self.__likelySubTag('from', have) + self.__likelySubTag('to', give) + self.__closeTag('likelySubtag') + self.__closeTag('likelySubtags') + + def locales(self, locales, calendars): + self.__openTag('localeList') + self.__openTag('locale') + Locale.C(calendars).toXml(self.inTag, calendars) + self.__closeTag('locale') + keys = locales.keys() + keys.sort() + for key in keys: + self.__openTag('locale') + locales[key].toXml(self.inTag, calendars) + self.__closeTag('locale') + self.__closeTag('localeList') + + def version(self, cldrVersion): + self.inTag('version', cldrVersion) + + def inTag(self, tag, text): + self.__write('<{0}>{1}'.format(tag, text)) + + def close(self): + if self.__rawOutput != self.__complain: + self.__write('') + self.__rawOutput = self.__complain + + # Implementation details + @staticmethod + def __printit(text): + print(text, end='') + @staticmethod + def __complain(text): + raise Error('Attempted to write data after closing :-(') + + def __enumTable(self, tag, table): + self.__openTag(tag) + for key, value in table.iteritems(): + self.__openTag(tag[:-4]) + self.inTag('name', value[0]) + self.inTag('id', key) + self.inTag('code', value[1]) + self.__closeTag(tag[:-4]) + self.__closeTag(tag) + + def __likelySubTag(self, tag, likely): + 
self.__openTag(tag) + self.inTag('language', likely[0]) + self.inTag('script', likely[1]) + self.inTag('country', likely[2]) + # self.inTag('variant', likely[3]) + self.__closeTag(tag) + + def __openTag(self, tag): + self.__write('<{}>'.format(tag)) + def __closeTag(self, tag): + self.__write(''.format(tag)) + + def __write(self, line): + self.__rawOutput(self.__wrap(line)) + +class Locale (object): + """Holder for the assorted data representing one locale. + + Implemented as a namespace; its constructor and update() have the + same signatures as those of a dict, acting on the instance's + __dict__, so the results are accessed as attributes rather than + mapping keys.""" + def __init__(self, data=None, **kw): + self.update(data, **kw) + + def update(self, data=None, **kw): + if data: self.__dict__.update(data) + if kw: self.__dict__.update(kw) + + def __len__(self): # Used when testing as a boolean + return len(self.__dict__) + @staticmethod def propsMonthDay(scale, lengths=('long', 'short', 'narrow')): for L in lengths: @@ -176,19 +344,26 @@ class Locale: return cls(data) - def toXml(self, calendars=('gregorian',), indent=' ', tab=' '): - print indent + '' - inner = indent + tab + def toXml(self, write, calendars=('gregorian',)): + """Writes its data as QLocale XML. + + First argument, write, is a callable taking the name and + content of an XML element; it is expected to be the inTag + bound method of a QLocaleXmlWriter instance. + + Optional second argument is a list of calendar names, in the + form used by CLDR; its default is ('gregorian',). 
+ """ get = lambda k: getattr(self, k) for key in ('language', 'script', 'country'): - print inner + "<%s>" % key + get(key) + "" % key - print inner + "<%scode>" % key + get(key + '_code') + "" % key + write(key, get(key)) + write('{}code'.format(key), get('{}_code'.format(key))) for key in ('decimal', 'group', 'zero'): - print inner + "<%s>" % key + ordStr(get(key)) + "" % key + write(key, ordStr(get(key))) for key, std in (('list', ';'), ('percent', '%'), ('minus', '-'), ('plus', '+'), ('exp', 'e')): - print inner + "<%s>" % key + fixOrdStr(get(key), std) + "" % key + write(key, fixOrdStr(get(key), std)) for key in ('languageEndonym', 'countryEndonym', 'quotationStart', 'quotationEnd', @@ -206,16 +381,10 @@ class Locale: '_'.join((k, cal)) for k in self.propsMonthDay('months') for cal in calendars): - print inner + "<%s>%s" % (key, escape(get(key)).encode('utf-8'), key) + write(key, escape(get(key)).encode('utf-8')) for key in ('currencyDigits', 'currencyRounding'): - print inner + "<%s>%d" % (key, get(key), key) - - print indent + "" - - def __init__(self, data=None, **kw): - if data: self.__dict__.update(data) - if kw: self.__dict__.update(kw) + write(key, get(key)) # Tools used by __monthNames: def fullName(i, name): return name @@ -261,8 +430,8 @@ class Locale: for cal in calendars: try: data = known[cal] - except KeyError: # Need to add an entry to known, above. - print 'Unsupported calendar:', cal + except KeyError as e: # Need to add an entry to known, above. 
+ e.args += ('Unsupported calendar:', cal) raise names, get = data[0] + ('',), data[1:] for n, size in enumerate(sizes): @@ -279,7 +448,7 @@ class Locale: 'Thursday', 'Friday', 'Saturday', ''), quantifiers=('k', 'M', 'G', 'T', 'P', 'E')): """Returns an object representing the C locale.""" - return cls(dict(cls.__monthNames(calendars)), + return cls(cls.__monthNames(calendars), language='C', language_code='0', languageEndonym='', script='AnyScript', script_code='0', country='AnyCountry', country_code='0', countryEndonym='', -- cgit v1.2.3