diff options
Diffstat (limited to 'util')
-rw-r--r-- | util/locale_database/cldr.py | 498 | ||||
-rwxr-xr-x | util/locale_database/cldr2qlocalexml.py | 636 | ||||
-rw-r--r-- | util/locale_database/ldml.py | 450 | ||||
-rwxr-xr-x | util/locale_database/qlocalexml2cpp.py | 2 | ||||
-rw-r--r-- | util/locale_database/xpathlite.py | 284 |
5 files changed, 972 insertions, 898 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py index 7890adf307..94459b9e3f 100644 --- a/util/locale_database/cldr.py +++ b/util/locale_database/cldr.py @@ -27,7 +27,8 @@ ############################################################################# """Digesting the CLDR's data. -Provides two class: +Provides two classes: + CldrReader -- driver for reading CLDR data CldrAccess -- used by the reader to access the tree of data files The former should normally be all you need to access. @@ -38,9 +39,206 @@ from xml.dom import minidom from weakref import WeakValueDictionary as CacheDict import os -from localetools import Error -from ldml import Node, Supplement +from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner +from qlocalexml import Locale +class CldrReader (object): + def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None): + """Set up a reader object for reading CLDR data. + + Single parameter, root, is the file-system path to the root of + the unpacked CLDR archive; its common/ sub-directory should + contain dtd/, main/ and supplemental/ sub-directories. + + Optional second argument, grumble, is a callable that logs + warnings and complaints, e.g. sys.stderr.write would be a + suitable callable. The default is a no-op that ignores its + single argument. Optional third argument is similar, used for + less interesting output; pass sys.stderr.write for it for + verbose output.""" + self.root = CldrAccess(root) + self.whitter, self.grumble = whitter, grumble + + def likelySubTags(self): + """Generator for likely subtag information. + + Yields pairs (have, give) of 4-tuples; if what you have + matches the left member, giving the right member is probably + sensible. 
Each 4-tuple's entries are the full names of a + language, a script, a country (strictly territory) and a + variant (currently ignored).""" + skips = [] + for got, use in self.root.likelySubTags(): + try: + have = self.__parseTags(got) + give = self.__parseTags(use) + except Error as e: + if ((use.startswith(got) or got.startswith('und_')) + and e.message.startswith('Unknown ') and ' code ' in e.message): + skips.append(use) + else: + self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message)) + continue + if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]): + continue + + give = (give[0], + # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags + have[1] if give[1] == 'AnyScript' else give[1], + have[2] if give[2] == 'AnyCountry' else give[2], + give[3]) # AnyVariant similarly ? + + yield have, give + + if skips: + # TODO: look at LDML's reserved locale tag names; they + # show up a lot in this, and may be grounds for filtering + # more out. 
+ pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips) + + def readLocales(self, calendars = ('gregorian',)): + locales = tuple(self.__allLocales(calendars)) + return dict(((k.language_id, k.script_id, k.country_id, k.variant_code), + k) for k in locales) + + def __allLocales(self, calendars): + def skip(locale, reason): + return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason) + + for locale in self.root.defaultContentLocales: + try: + language, script, country, variant = self.__splitLocale(locale) + except ValueError: + self.whitter(skip(locale, 'only language tag')) + continue + + if not (script or country): + self.grumble(skip(locale, 'second tag is neither script nor territory')) + continue + + if not (language and country): + continue + + try: + yield self.__getLocaleData(self.root.locale(locale), calendars, + language, script, country, variant) + except Error as e: + self.grumble(skip(locale, e.message)) + + for locale in self.root.fileLocales: + try: + chain = self.root.locale(locale) + language, script, country, variant = chain.tagCodes() + assert language + # TODO: this skip should probably be based on likely + # sub-tags, instead of empty country: if locale has a + # likely-subtag expansion, that's what QLocale uses, + # and we'll be saving its data for the expanded locale + # anyway, so don't need to record it for itself. + # See also QLocaleXmlReader.loadLocaleMap's grumble. 
+ if not country: + continue + yield self.__getLocaleData(chain, calendars, language, script, country, variant) + except Error as e: + self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message)) + + import textwrap + @staticmethod + def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap): + writer('\n'.join(wrap(prefix + ', '.join(tokens), + subsequent_indent=' ', width=80)) + '\n') + del textwrap + + def __parseTags(self, locale): + tags = self.__splitLocale(locale) + language = tags.next() + script = country = variant = '' + try: + script, country, variant = tags + except ValueError: + pass + return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant)) + + def __splitLocale(self, name): + """Generate (language, script, territory, variant) from a locale name + + Ignores any trailing fields (with a warning), leaves script (a + capitalised four-letter token), territory (either a number or + an all-uppercase token) or variant (upper case and digits) + empty if unspecified. Only generates one entry if name is a + single tag (i.e. contains no underscores). Always yields 1 or + 4 values, never 2 or 3.""" + tags = iter(name.split('_')) + yield tags.next() # Language + tag = tags.next() # may raise StopIteration + + # Script is always four letters, always capitalised: + if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Territory is upper-case or numeric: + if tag and tag.isupper() or tag.isdigit(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Variant can be any mixture of upper-case and digits. 
+ if tag and all(c.isupper() or c.isdigit() for c in tag): + yield tag + tag = '' + else: + yield '' + + # If nothing is left, StopIteration will avoid the warning: + if not tag: + tag = tags.next() + self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join(tag + tuple(tags)), name)) + + def __getLocaleData(self, scan, calendars, language, script, country, variant): + ids, names = zip(*self.root.codesToIdName(language, script, country, variant)) + assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant) + locale = Locale( + language = names[0], language_code = language, language_id = ids[0], + script = names[1], script_code = script, script_id = ids[1], + country = names[2], country_code = country, country_id = ids[2], + variant_code = variant) + + firstDay, weStart, weEnd = self.root.weekData(country) + assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') + for day in (firstDay, weStart, weEnd)) + + locale.update(firstDayOfWeek = firstDay, + weekendStart = weStart, + weekendEnd = weEnd) + + iso, digits, rounding = self.root.currencyData(country) + locale.update(currencyIsoCode = iso, + currencyDigits = int(digits), + currencyRounding = int(rounding)) + + locale.update(scan.currencyData(iso)) + locale.update(scan.numericData(self.root.numberSystem, self.whitter)) + locale.update(scan.textPatternData()) + locale.update(scan.endonyms(language, script, country, variant)) + locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ... + locale.update(scan.calendarNames(calendars)) # Names of days and months + + return locale + +# Note: various caches assume this class is a singleton, so the +# "default" value for a parameter no caller should pass can serve as +# the cache. If a process were to instantiate this class with distinct +# roots, each cache would be filled by the first to need it ! class CldrAccess (object): def __init__(self, root): """Set up a master object for accessing CLDR data. 
@@ -50,6 +248,12 @@ class CldrAccess (object): contain dtd/, main/ and supplemental/ sub-directories.""" self.root = root + def xml(self, *path): + """Load a single XML file and return its root element as an XmlScanner. + + The path is interpreted relative to self.root""" + return XmlScanner(Node(self.__xml(path))) + def supplement(self, name): """Loads supplemental data as a Supplement object. @@ -57,6 +261,117 @@ class CldrAccess (object): """ return Supplement(Node(self.__xml(('common', 'supplemental', name)))) + def locale(self, name): + """Loads all data for a locale as a LocaleScanner object. + + The name should be a locale name; adding suffix '.xml' to it + should usually yield a file in common/main/. The returned + LocaleScanner object packages this file along with all those + from which it inherits; its methods know how to handle that + inheritance, where relevant.""" + return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) + + @property + def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir, + splitExtension = os.path.splitext): + """Generator for locale IDs seen in file-names. + + All *.xml other than root.xml in common/main/ are assumed to + identify locales.""" + for name in listDirectory(joinPath(self.root, 'common', 'main')): + stem, ext = splitExtension(name) + if ext == '.xml' and stem != 'root': + yield stem + + @property + def defaultContentLocales(self): + """Generator for the default content locales.""" + for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'): + try: + locales = attrs['locales'] + except KeyError: + pass + else: + for locale in locales.split(): + yield locale + + def likelySubTags(self): + for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'): + yield attrs['from'], attrs['to'] + + def numberSystem(self, system): + """Get a description of a numbering system. 
+ + Returns a mapping, with keys u'digits', u'type' and u'id'; the + value for this last is system. Raises KeyError for unknown + number system, ldml.Error on failure to load data.""" + try: + return self.__numberSystems[system] + except KeyError: + raise Error('Unsupported number system: {}'.format(system)) + + def weekData(self, country): + """Data on the weekly cycle. + + Returns a triple (W, S, E) of en's short names for week-days; + W is the first day of the week, S the start of the week-end + and E the end of the week-end. Where data for a country is + unavailable, the data for CLDR's territory 001 (The World) is + used.""" + try: + return self.__weekData[country] + except KeyError: + return self.__weekData['001'] + + def currencyData(self, country): + """Returns currency data for the given country code. + + Return value is a tuple (ISO4217 code, digit count, rounding + mode). If CLDR provides no data for this country, ('', 2, 1) + is the default result. + """ + try: + return self.__currencyData[country] + except KeyError: + return '', 2, 1 + + def codesToIdName(self, language, script, country, variant = ''): + """Maps each code to the appropriate ID and name. + + Returns a 4-tuple of (ID, name) pairs corresponding to the + language, script, country and variant given. Raises a + suitable error if any of them is unknown, indicating all that + are unknown plus suitable names for any that could sensibly be + added to enumdata.py to make them known. 
+ + Until we implement variant support (QTBUG-81051), the fourth + member of the returned tuple is always 0 paired with a string + that should not be used.""" + enum = self.__enumMap + try: + return (enum('language')[language], + enum('script')[script], + enum('country')[country], + enum('variant')[variant]) + except KeyError: + pass + + parts, values = [], [language, script, country, variant] + for index, key in enumerate(('language', 'script', 'country', 'variant')): + naming, enums = self.__codeMap(key), enum(key) + value = values[index] + if value not in enums: + text = '{} code {}'.format(key, value) + name = naming.get(value) + if name and value != 'POSIX': + text += u' (could add {})'.format(name) + parts.append(text) + if len(parts) > 1: + parts[-1] = 'and ' + parts[-1] + assert parts + raise Error('Unknown ' + ', '.join(parts), + language, script, country, variant) + def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py """Digest CLDR's MS-Win time-zone name mapping. 
@@ -139,11 +454,97 @@ class CldrAccess (object): return open(joinPath(self.root, *path)) @property + def __rootLocale(self, cache = []): + if not cache: + cache.append(self.xml('common', 'main', 'root.xml')) + return cache[0] + + @property def __supplementalData(self, cache = []): if not cache: cache.append(self.supplement('supplementalData.xml')) return cache[0] + @property + def __numberSystems(self, cache = {}, joinPath=os.path.join): + if not cache: + for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'): + if ord(attrs.get('digits', u'\x10000')[0]) > 0xffff: + # FIXME, QTBUG-69324: make this redundant: + # omit number system if zero doesn't fit in single-char16 UTF-16 :-( + continue + + cache[attrs['id']] = attrs + assert cache + return cache + + @property + def __weekData(self, cache = {}): + if not cache: + firstDay, weStart, weEnd = self.__getWeekData() + # Massage those into an easily-consulted form: + # World defaults given for code '001': + mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001'] + lands = set(firstDay) | set(weStart) | set(weEnd) + cache.update((land, + (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun))) + for land in lands) + assert cache + return cache + + def __getWeekData(self): + """Scan for data on the weekly cycle. + + Yields three mappings from locales to en's short names for + week-days; if a locale isn't a key of a given mapping, it + should use the '001' (world) locale's value. 
The first mapping + gives the day on which the week starts, the second gives the + day on which the week-end starts, the third gives the last day + of the week-end.""" + source = self.__supplementalData + for key in ('firstDay', 'weekendStart', 'weekendEnd'): + result = {} + for ignore, attrs in source.find('weekData/' + key): + assert ignore == key + day = attrs['day'] + assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day + if 'alt' in attrs: + continue + for loc in attrs.get('territories', '').split(): + result[loc] = day + yield result + + @property + def __currencyData(self, cache = {}): + if not cache: + source = self.__supplementalData + for elt in source.findNodes('currencyData/region'): + iso, digits, rounding = '', 2, 1 + try: + country = elt.dom.attributes['iso3166'].nodeValue + except KeyError: + continue + for child in elt.findAllChildren('currency'): + try: + if child.dom.attributes['tender'].nodeValue == 'false': + continue + except KeyError: + pass + try: + child.dom.attributes['to'] # Is set if this element has gone out of date. 
+ except KeyError: + iso = child.dom.attributes['iso4217'].nodeValue + break + if iso: + for tag, data in source.find( + 'currencyData/fractions/info[iso4217={}]'.format(iso)): + digits = data['digits'] + rounding = data['rounding'] + cache[country] = iso, digits, rounding + assert cache + + return cache + def __scanLdmlDtd(self, joinPath = os.path.join): """Scan the LDML DTD, record CLDR version.""" with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd: @@ -151,7 +552,8 @@ class CldrAccess (object): if line.startswith('<!ATTLIST '): parts = line.split() if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']: - # parts[5] is the version, in quotes, although the final > might be stuck on its end: + # parts[5] is the version, in quotes, maybe + # with a final > attached to its end: self.__cldrVersion = parts[5].split('"')[1] break @@ -178,5 +580,93 @@ class CldrAccess (object): return cache[key] + def __codeMap(self, key, cache = {}, + # Maps our name for it to CLDR's name: + naming = {'language': 'languages', 'script': 'scripts', + 'country': 'territories', 'variant': 'variants'}): + if not cache: + root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames') + for dst, src in naming.items(): + cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src))) + assert cache + + return cache[key] + + def __codeMapScan(self, node): + """Get mapping from codes to element values. + + Passed in node is a <languages>, <scripts>, <territories> or + <variants> node, each child of which is a <language>, + <script>, <territory> or <variant> node as appropriate, whose + type is a code (of the appropriate flavour) and content is its + full name. In some cases, two child nodes have the same type; + in these cases, one always has an alt attribute and we should + prefer the other. 
Yields all such type, content pairs found + in node's children (skipping any with an alt attribute, if + their type has been seen previously).""" + seen = set() + for elt in node.dom.childNodes: + try: + key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText + except (KeyError, ValueError, TypeError): + pass + else: + if key not in seen or not elt.attributes.has_key('alt'): + yield key, value + seen.add(key) + + # CLDR uses inheritance between locales to save repetition: + def __parentLocale(self, name, cache = {}): + # see http://www.unicode.org/reports/tr35/#Parent_Locales + if not cache: + for tag, attrs in self.__supplementalData.find('parentLocales'): + parent = attrs.get('parent', '') + for child in attrs['locales'].split(): + cache[child] = parent + assert cache + + return cache[name] + + def __localeAsDoc(self, name, aliasFor = None, + joinPath = os.path.join, exists = os.path.isfile): + path = ('common', 'main', name + '.xml') + if exists(joinPath(self.root, *path)): + elt = self.__xml(path) + for child in Node(elt).findAllChildren('alias'): + try: + alias = child.dom.attributes['source'].nodeValue + except (KeyError, AttributeError): + pass + else: + return self.__localeAsDoc(alias, aliasFor or name) + # No alias child with a source: + return elt + + if aliasFor: + raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias' + .format(aliasFor, name)) + + def __scanLocaleRoots(self, name): + while name and name != 'root': + doc = self.__localeAsDoc(name) + if doc is not None: + yield Node(doc) + + try: + name = self.__parentLocale(name) + except KeyError: + try: + name, tail = name.rsplit('_', 1) + except ValueError: # No tail to discard: we're done + break + + class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok. 
+ def __localeRoots(self, name, cache = CacheDict()): + try: + chain = cache[name] + except KeyError: + cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name)) + return chain + # Unpolute the namespace: we don't need to export these. del minidom, CacheDict, os diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py index 41795ff634..b28dcecc45 100755 --- a/util/locale_database/cldr2qlocalexml.py +++ b/util/locale_database/cldr2qlocalexml.py @@ -2,7 +2,7 @@ # coding=utf8 ############################################################################# ## -## Copyright (C) 2018 The Qt Company Ltd. +## Copyright (C) 2020 The Qt Company Ltd. ## Contact: https://www.qt.io/licensing/ ## ## This file is part of the test suite of the Qt Toolkit. @@ -31,15 +31,17 @@ The CLDR data can be downloaded from CLDR_, which has a sub-directory for each version; you need the ``core.zip`` file for your version of -choice (typically the latest). This script has had updates to cope up -to v35; for later versions, we may need adaptations. Unpack the +choice (typically the latest). This script has had updates to cope up +to v35; for later versions, we may need adaptations. Unpack the downloaded ``core.zip`` and check it has a common/main/ sub-directory: -pass the path of that sub-directory to this script as its single -command-line argument. Save its standard output (but not error) to a -file for later processing by ``./qlocalexml2cpp.py`` +pass the path of that root of the download to this script as its first +command-line argument. Pass the name of the file in which to write +output as the second argument; either omit it or use '-' to select the +standard output. This file is the input needed by +``./qlocalexml2cpp.py`` When you update the CLDR data, be sure to also update -src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check +src/corelib/text/qt_attribution.json's entry for unicode-cldr. 
Check this script's output for unknown language, country or script messages; if any can be resolved, use their entry in common/main/en.xml to append new entries to enumdata.py's lists and update documentation in @@ -53,610 +55,62 @@ time zone names; see cldr2qtimezone.py for details. """ import os -import sys -import re -import textwrap -import enumdata from localetools import Error -from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, codeMapsFromFile, \ - _findEntryInFile as findEntryInFile -from dateconverter import convert_date -from qlocalexml import Locale, QLocaleXmlWriter - -# TODO: make calendars a command-line option -calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew' -def wrappedwarn(err, prefix, tokens): - return err.write( - '\n'.join(textwrap.wrap(prefix + ', '.join(tokens), - subsequent_indent=' ', width=80)) + '\n') - -def parse_number_format(patterns, data): - # this is a very limited parsing of the number format for currency only. - def skip_repeating_pattern(x): - p = x.replace('0', '#').replace(',', '').replace('.', '') - seen = False - result = '' - for c in p: - if c == '#': - if seen: - continue - seen = True - else: - seen = False - result = result + c - return result - patterns = patterns.split(';') - result = [] - for pattern in patterns: - pattern = skip_repeating_pattern(pattern) - pattern = pattern.replace('#', "%1") - # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns - # there can be doubled or trippled currency sign, however none of the - # locales use that. - pattern = pattern.replace(u'\xa4', "%2") - pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'") - pattern = pattern.replace('-', data['minus']) - pattern = pattern.replace('+', data['plus']) - result.append(pattern) - return result - -cldr_dir = None -def raiseUnknownCode(code, form, cache={}): - """Check whether an unknown code could be supported. 
- - We declare a language, script or country code unknown if it's not - known to enumdata.py; however, if it's present in main/en.xml's - mapping of codes to names, we have the option of adding support. - This caches the necessary look-up (so we only read main/en.xml - once) and returns the name we should use if we do add support. - - First parameter, code, is the unknown code. Second parameter, - form, is one of 'language', 'script' or 'country' to select the - type of code to look up. Do not pass further parameters (the next - will deprive you of the cache). - - Raises localetools.Error with a suitable message, that includes - the unknown code's full name if found. - - Relies on global cldr_dir being set before it's called; see tail - of this file. - """ - if not cache: - cache.update(codeMapsFromFile(os.path.join(cldr_dir, 'en.xml'))) - name = cache[form].get(code) - msg = 'unknown %s code "%s"' % (form, code) - if name: - msg += ' - could use "%s"' % name - raise Error(msg) - -def parse_list_pattern_part_format(pattern): - # This is a very limited parsing of the format for list pattern part only. - return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3") - -def unit_quantifiers(find, path, stem, suffix, known, - # Stop at exa/exbi: 16 exbi = 2^{64} < zetta = - # 1000^7 < zebi = 2^{70}, the next quantifiers up: - si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')): - """Work out the unit quantifiers. - - Unfortunately, the CLDR data only go up to terabytes and we want - all the way to exabytes; but we can recognize the SI quantifiers - as prefixes, strip and identify the tail as the localized - translation for 'B' (e.g. French has 'octet' for 'byte' and uses - ko, Mo, Go, To from which we can extrapolate Po, Eo). 
- - Should be called first for the SI quantifiers, with suffix = 'B', - then for the IEC ones, with suffix = 'iB'; the list known - (initially empty before first call) is used to let the second call - know what the first learned about the localized unit. - """ - if suffix == 'B': # first call, known = [] - tail = suffix - for q in si_quantifiers: - it = find(path, stem % q) - # kB for kilobyte, in contrast with KiB for IEC: - q = q[0] if q == 'kilo' else q[0].upper() - if not it: - it = q + tail - elif it.startswith(q): - rest = it[1:] - tail = rest if all(rest == k for k in known) else suffix - known.append(rest) - yield it - else: # second call, re-using first's known - assert suffix == 'iB' - if known: - byte = known.pop() - if all(byte == k for k in known): - suffix = 'i' + byte - for q in si_quantifiers: - yield find(path, stem % q[:2], - # Those don't (yet, v31) exist in CLDR, so we always fall back to: - q[0].upper() + suffix) - -def generateLocaleInfo(path): - if not path.endswith(".xml"): - return {} - - # skip legacy/compatibility ones - alias = findAlias(path) - if alias: - raise Error('Alias to "{}"'.format(alias)) - - def code(tag): - return findEntryInFile(path, 'identity/' + tag, attribute="type")[0] - - return _generateLocaleInfo(path, code('language'), code('script'), - code('territory'), code('variant')) - -def getNumberSystems(cache={}): - """Cached look-up of number system information. - - Pass no arguments. Returns a mapping from number system names to, - for each system, a mapping with keys 'digits', 'type' and 'id'. 
- Relies on global cldr_dir being set before it's first called.\n""" - if not cache: - for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', - 'numberingSystems.xml'), - 'numberingSystems'): - # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]] - entry = dict(ns[1]) - name = entry[u'id'] - if u'digits' in entry and ord(entry[u'digits'][0]) > 0xffff: - # FIXME, QTBUG-69324: make this redundant: - # omit number system if zero doesn't fit in single-char16 UTF-16 :-( - sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X]\n' - % (name, ord(entry[u'digits'][0]))) - else: - cache[name] = entry - return cache - -def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""): - if not path.endswith(".xml"): - return {} - - if language_code == 'root': - # just skip it - return {} - - # we do not support variants - # ### actually there is only one locale with variant: en_US_POSIX - # does anybody care about it at all? - if variant_code: - raise Error('We do not support variants ("{}")'.format(variant_code)) - - language_id = enumdata.languageCodeToId(language_code) - if language_id <= 0: - raiseUnknownCode(language_code, 'language') - - script_id = enumdata.scriptCodeToId(script_code) - if script_id == -1: - raiseUnknownCode(script_code, 'script') - - # we should handle fully qualified names with the territory - if not country_code: - return {} - country_id = enumdata.countryCodeToId(country_code) - if country_id <= 0: - raiseUnknownCode(country_code, 'country') - - # So we say we accept only those values that have "contributed" or - # "approved" resolution. see http://www.unicode.org/cldr/process.html - # But we only respect the resolution for new datas for backward - # compatibility. 
- draft = DraftResolution.contributed - - result = dict( - language=enumdata.language_list[language_id][0], - language_code=language_code, language_id=language_id, - script=enumdata.script_list[script_id][0], - script_code=script_code, script_id=script_id, - country=enumdata.country_list[country_id][0], - country_code=country_code, country_id=country_id, - variant_code=variant_code) - - (dir_name, file_name) = os.path.split(path) - def from_supplement(tag, - path=os.path.join(dir_name, '..', 'supplemental', - 'supplementalData.xml')): - return findTagsInFile(path, tag) - currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code) - result['currencyIsoCode'] = '' - result['currencyDigits'] = 2 - result['currencyRounding'] = 1 - if currencies: - for e in currencies: - if e[0] == 'currency': - t = [x[1] == 'false' for x in e[1] if x[0] == 'tender'] - if t and t[0]: - pass - elif not any(x[0] == 'to' for x in e[1]): - result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next() - break - if result['currencyIsoCode']: - t = from_supplement("currencyData/fractions/info[iso4217=%s]" - % result['currencyIsoCode']) - if t and t[0][0] == 'info': - result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next() - result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next() - numbering_system = None - try: - numbering_system = findEntry(path, "numbers/defaultNumberingSystem") - except Error: - pass - def findEntryDef(path, xpath, value=''): - try: - return findEntry(path, xpath) - except Error: - return value - def get_number_in_system(path, xpath, numbering_system): - if numbering_system: - try: - return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]") - except Error: - # in CLDR 1.9 number system was refactored for numbers (but not for currency) - # so if previous findEntry doesn't work we should try this: - try: - return findEntry(path, xpath.replace("/symbols/", 
"/symbols[numberSystem=" + numbering_system + "]/")) - except Error: - # fallback to default - pass - return findEntry(path, xpath) - - result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system) - result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system) - assert result['decimal'] != result['group'] - result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system) - result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system) - try: - result['zero'] = getNumberSystems()[numbering_system][u"digits"][0] - except Exception as e: - sys.stderr.write("Native zero detection problem: %s\n" % repr(e)) - result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system) - result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system) - result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system) - result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system).lower() - result['quotationStart'] = findEntry(path, "delimiters/quotationStart") - result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd") - result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart") - result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd") - result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]")) - result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]")) - result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]")) - result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]")) - result['am'] = findEntry(path, 
"dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft) - result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft) - result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern")) - result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern")) - result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern")) - result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern")) - - endonym = None - if country_code and script_code: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code)) - if not endonym and script_code: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code)) - if not endonym and country_code: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code)) - if not endonym: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code)) - result['languageEndonym'] = endonym - result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code)) - - currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system) - currency_format = parse_number_format(currency_format, result) - result['currencyFormat'] = currency_format[0] - result['currencyNegativeFormat'] = '' - if len(currency_format) > 1: - result['currencyNegativeFormat'] = 
currency_format[1] - - result['currencySymbol'] = '' - result['currencyDisplayName'] = '' - if result['currencyIsoCode']: - result['currencySymbol'] = findEntryDef(path, "numbers/currencies/currency[%s]/symbol" % result['currencyIsoCode']) - result['currencyDisplayName'] = ';'.join( - findEntryDef(path, 'numbers/currencies/currency[' + result['currencyIsoCode'] - + ']/displayName' + tail) - for tail in ['',] + [ - '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other') - ]) + ';' - - def findUnitDef(path, stem, fallback=''): - # The displayName for a quantified unit in en.xml is kByte - # instead of kB (etc.), so prefer any unitPattern provided: - for count in ('many', 'few', 'two', 'other', 'zero', 'one'): - try: - ans = findEntry(path, stem + 'unitPattern[count=%s]' % count) - except Error: - continue - - # TODO: epxloit count-handling, instead of discarding placeholders - if ans.startswith('{0}'): - ans = ans[3:].lstrip() - if ans: - return ans - - return findEntryDef(path, stem + 'displayName', fallback) - - # First without quantifier, then quantified each way: - result['byte_unit'] = findEntryDef( - path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName', - 'bytes') - stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/' - known = [] # cases where we *do* have a given version: - result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known)) - # IEC 60027-2 - # http://physics.nist.gov/cuu/Units/binary.html - result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known)) - - # Used for month and day data: - namings = ( - ('standaloneLong', 'stand-alone', 'wide'), - ('standaloneShort', 'stand-alone', 'abbreviated'), - ('standaloneNarrow', 'stand-alone', 'narrow'), - ('long', 'format', 'wide'), - ('short', 'format', 'abbreviated'), - ('narrow', 'format', 'narrow'), - ) - - # Month names for 12-month calendars: - for cal in calendars: - stem = 
'dates/calendars/calendar[' + cal + ']/months/' - for (key, mode, size) in namings: - prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/' - result[key + 'Months_' + cal] = ';'.join( - findEntry(path, stem + prop + "month[%d]" % i) - for i in range(1, 13)) + ';' - - # Day data (for Gregorian, at least): - stem = 'dates/calendars/calendar[gregorian]/days/' - days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat') - for (key, mode, size) in namings: - prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day' - result[key + 'Days'] = ';'.join( - findEntry(path, stem + prop + '[' + day + ']') - for day in days) + ';' - - return Locale(result) - -def integrateWeekData(filePath, locale_database): - if not filePath.endswith(".xml"): - return {} - - def lookup(key): - return findEntryInFile(filePath, key, attribute='territories')[0].split() - days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') - - firstDayByCountryCode = {} - for day in days: - for countryCode in lookup('weekData/firstDay[day=%s]' % day): - firstDayByCountryCode[countryCode] = day - - weekendStartByCountryCode = {} - for day in days: - for countryCode in lookup('weekData/weekendStart[day=%s]' % day): - weekendStartByCountryCode[countryCode] = day - - weekendEndByCountryCode = {} - for day in days: - for countryCode in lookup('weekData/weekendEnd[day=%s]' % day): - weekendEndByCountryCode[countryCode] = day - - for (key, locale) in locale_database.iteritems(): - countryCode = locale.country_code - if countryCode in firstDayByCountryCode: - locale.firstDayOfWeek = firstDayByCountryCode[countryCode] - else: - locale.firstDayOfWeek = firstDayByCountryCode["001"] - - if countryCode in weekendStartByCountryCode: - locale.weekendStart = weekendStartByCountryCode[countryCode] - else: - locale.weekendStart = weekendStartByCountryCode["001"] - - if countryCode in weekendEndByCountryCode: - locale.weekendEnd = weekendEndByCountryCode[countryCode] - else: - locale.weekendEnd = 
weekendEndByCountryCode["001"] - -def splitLocale(name): - """Split name into (language, script, territory) triple as generator. - - Ignores any trailing fields (with a warning), leaves script (a capitalised - four-letter token) or territory (either a number or an all-uppercase token) - empty if unspecified, returns a single-entry generator if name is a single - tag (i.e. contains no underscores). Always yields 1 or 3 values, never 2.""" - tags = iter(name.split('_')) - yield tags.next() # Language - tag = tags.next() - - # Script is always four letters, always capitalised: - if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): - yield tag - try: - tag = tags.next() - except StopIteration: - tag = '' - else: - yield '' - - # Territory is upper-case or numeric: - if tag and tag.isupper() or tag.isdigit(): - yield tag - tag = '' - else: - yield '' - - # If nothing is left, StopIteration will avoid the warning: - tag = (tag if tag else tags.next(),) - sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name)) - -def _parseLocale(l): - language = "AnyLanguage" - script = "AnyScript" - country = "AnyCountry" - - if l == "und": - raise Error('We treat unknown locale like C') - - parsed = splitLocale(l) - language_code = parsed.next() - script_code = country_code = '' - try: - script_code, country_code = parsed - except ValueError: - pass - - if language_code != "und": - language_id = enumdata.languageCodeToId(language_code) - if language_id == -1: - raise Error('Unknown language code "{}"'.format(language_code)) - language = enumdata.language_list[language_id][0] - - if script_code: - script_id = enumdata.scriptCodeToId(script_code) - if script_id == -1: - raise Error('Unknown script code "{}"'.format(script_code)) - script = enumdata.script_list[script_id][0] - - if country_code: - country_id = enumdata.countryCodeToId(country_code) - if country_id == -1: - raise Error('Unknown country code "{}"'.format(country_code)) - country 
= enumdata.country_list[country_id][0] - - return (language, script, country) - -def likelySubtags(root, err): - skips = [] - for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"): - tmp = {} - for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]] - tmp[data[0]] = data[1] - - try: - from_language, from_script, from_country = _parseLocale(tmp[u"from"]) - to_language, to_script, to_country = _parseLocale(tmp[u"to"]) - except Error as e: - if (tmp['to'].startswith(tmp['from']) - and e.message == 'Unknown language code "{}"'.format(tmp['from'])): - skips.append(tmp['to']) - else: - sys.stderr.write('skipping likelySubtag "{}" -> "{}" ({})\n'.format( - tmp[u"from"], tmp[u"to"], e.message)) - continue - # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags - if to_country == "AnyCountry" and from_country != to_country: - to_country = from_country - if to_script == "AnyScript" and from_script != to_script: - to_script = from_script - - yield ((from_language, from_script, from_country), - (to_language, to_script, to_country)) - if skips: - wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips) +from cldr import CldrReader +from qlocalexml import QLocaleXmlWriter +from enumdata import language_list, script_list, country_list def usage(err, name, message = ''): - err.write("""Usage: {} <path-to-cldr-main> [out-file.xml] -""".format(name)) # TODO: expand + err.write("""Usage: {} path/to/cldr/common/main [out-file.xml] +""".format(name)) # TODO: expand command-line, improve help message if message: err.write('\n' + message + '\n') def main(args, out, err): - name = args.pop(0) + # TODO: make calendars a command-line option + calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew' - if len(args) < 1: - usage(err, name) + # TODO: make argument parsing more sophisticated + name = args.pop(0) + if not args: + usage(name, err, 'Where 
is your CLDR data tree ?') return 1 - global cldr_dir - cldr_dir = args.pop(0) - if not os.path.isdir(cldr_dir): - usage(err, name, 'Where did you unpack the CLDR data files ?') + root = args.pop(0) + if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')): + usage(name, err, + 'First argument is the root of the CLDR tree: found no common/main/root.xml under ' + + root) return 1 - if len(args) > 1: - usage(err, name, 'Too many arguments passed') + xml = args.pop(0) if args else None + if not xml or xml == '-': + emit = out + elif not xml.endswith('.xml'): + usage(name, err, 'Please use a .xml extension on your output file name, not ' + xml) return 1 - if args: - qxml = open(args.pop(0), 'w') else: - qxml = out - - getNumberSystems(cldr_dir) - cldr_files = os.listdir(cldr_dir) - locale_database = {} - - # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content - defaultContent_locales = [] - for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', - 'supplementalMetadata.xml'), - 'metadata/defaultContent'): - for data in ns[1:][0]: - if data[0] == u"locales": - defaultContent_locales += data[1].split() - - skips = [] - for file in defaultContent_locales: try: - language_code, script_code, country_code = splitLocale(file) - except ValueError: - sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n') - continue + emit = open(xml, 'w') + except IOError as e: + usage(name, err, 'Failed to open "{}" to write output to it\n'.format(xml)) + return 1 - if not (script_code or country_code): - sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n') - continue - - try: - l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code) - if not l: - skips.append(file) - continue - except Error as e: - sys.stderr.write('skipping defaultContent locale "{}" ({})\n'.format(file, e.message)) - continue - - 
locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l - - if skips: - wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ', skips) - skips = [] - - for file in cldr_files: - try: - l = generateLocaleInfo(cldr_dir + "/" + file) - if not l: - skips.append(file) - continue - except Error as e: - sys.stderr.write('skipping file "{}" ({})\n'.format(file, e.message)) - continue - - locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l - - if skips: - wrappedwarn(err, 'skipping files [no locale info generated]: ', skips) + if args: + usage(name, err, 'Too many arguments - excess: ' + ' '.join(args)) + return 1 - integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml", locale_database) - cldr_version = 'unknown' - with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml: - for line in ldml: - if 'version cldrVersion CDATA #FIXED' in line: - cldr_version = line.split('"')[1] + # TODO - command line options to tune choice of grumble and whitter: + reader = CldrReader(root, err.write, err.write) + writer = QLocaleXmlWriter(emit.write) - xmlOut = QLocaleXmlWriter(qxml.write) - xmlOut.version(cldr_version) - xmlOut.enumData(enumdata.language_list, - enumdata.script_list, - enumdata.country_list) - xmlOut.likelySubTags(likelySubtags(os.path.split(cldr_dir)[0], err)) - xmlOut.locales(locale_database, calendars) - xmlOut.close() - if qxml is not out: - qxml.close() + writer.version(reader.root.cldrVersion) + writer.enumData(language_list, script_list, country_list) + writer.likelySubTags(reader.likelySubTags()) + writer.locales(reader.readLocales(calendars), calendars) + writer.close() return 0 if __name__ == '__main__': diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py index 4aaa728a86..ff94f3da73 100644 --- a/util/locale_database/ldml.py +++ b/util/locale_database/ldml.py @@ -39,10 +39,12 @@ returned by minidom.parse() and their child-nodes: Node -- wraps any node in the 
DOM tree XmlScanner -- wraps the root element of a stand-alone XML file Supplement -- specializes XmlScanner for supplemental data files + LocaleScanner -- wraps a locale's inheritance-chain of file roots See individual classes for further detail. """ from localetools import Error +from dateconverter import convert_date class Node (object): """Wrapper for an arbitrary DOM node. @@ -51,11 +53,20 @@ class Node (object): nodes are returned wrapped as Node objects. A Node exposes the raw DOM node it wraps via its .dom attribute.""" - def __init__(self, elt): + def __init__(self, elt, draft = 0): """Wraps a DOM node for ease of access. - Single argument, elt, is the DOM node to wrap.""" + First argument, elt, is the DOM node to wrap. (Optional second + argument, draft, should only be supplied by this class's + creation of child nodes; it is the maximum draft score of any + ancestor of the new node.)""" self.dom = elt + try: + attr = elt.attributes['draft'].nodeValue + except KeyError: + self.draft = draft + else: + self.draft = max(draft, self.draftScore(attr)) def findAllChildren(self, tag, wanted = None): """All children that do have the given tag and attributes. @@ -65,34 +76,60 @@ class Node (object): Optional second argument, wanted, should either be None or map attribute names to the values they must have. 
Only child nodes - with these attributes set to the given values are yielded.""" + with thes attributes set to the given values are yielded.""" - cutoff = 4 # Only accept approved, for now for child in self.dom.childNodes: if child.nodeType != child.ELEMENT_NODE: continue if child.nodeName != tag: continue - try: - draft = child.attributes['draft'] - except KeyError: - pass - else: - if self.__draftScores.get(draft, 0) < cutoff: - continue - - if wanted is not None: + if wanted: try: - if wanted and any(child.attributes[k].nodeValue != v for k, v in wanted.items()): + if any(child.attributes[k].nodeValue != v + for k, v in wanted.items()): continue except KeyError: # Some wanted attribute is missing continue - yield Node(child) + yield Node(child, self.draft) + + def findUniqueChild(self, tag): + """Returns the single child with the given nodeName. + + Raises Error if there is no such child or there is more than + one.""" + seq = self.findAllChildren(tag) + try: + node = seq.next() + except StopIteration: + raise Error('No child found where one was expected', tag) + for it in seq: + raise Error('Many children found where only one was expected', tag) + return node + + @classmethod + def draftScore(cls, level): + """Maps draft level names to numeric scores. + + Single parameter, level, is the least sure value of the draft + attribute on a node that you're willing to accept; returns a + numeric value (lower is less drafty). - __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2, - contributed = 3, approved = 4, false = 4) + Tempting as it is to insist on low draft scores, there are + many locales in which pretty much every leaf is + unconfirmed. It may make sense to actually check each + XmlScanner object, or each node in each LocaleScanner's nodes + list, to see what its distribution of draft level looks like, + so as to set the acceptable draft score for its elements + accordingly. 
However, for the moment, we mostly just accept + all elements, regardless of draft values (the one exception is + am/pm indicators).""" + return cls.__draftScores.get(level, 5) if level else 0 + + # Implementation details: + __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2, + contributed = 1, approved = 0, false = 0) def _parseXPath(selector): # Split "tag[attr=val][...]" into tag-name and attribute mapping @@ -129,7 +166,6 @@ class XmlScanner (object): return elts class Supplement (XmlScanner): - # Replaces xpathlite.findTagsInFile() def find(self, xpath): elts = self.findNodes(xpath) for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,) @@ -138,3 +174,381 @@ class Supplement (XmlScanner): yield (elt.nodeName, dict((k, v if isinstance(v, basestring) else v.nodeValue) for k, v in elt.attributes.items())) + +class LocaleScanner (object): + def __init__(self, name, nodes, root): + self.name, self.nodes, self.base = name, nodes, root + + def find(self, xpath, draft = None): + tags = xpath.split('/') + while True: + replace = None + for elt in self.nodes: + for selector in tags: + tag, attrs = _parseXPath(selector) + for elt in elt.findAllChildren(tag, attrs): + if draft is None or elt.draft <= draft: + break # and process the next selector + else: + break # no child, try next elt in self.nodes + else: + # processed all selectors + try: + return elt.dom.firstChild.nodeValue + except (AttributeError, KeyError): + pass # move on to next elt in self.nodes + + # No match in self.nodes; check root + elt = self.base.root + for i, selector in enumerate(tags): + tag, attrs = _parseXPath(selector) + for alias in elt.findAllChildren('alias'): + if alias.dom.attributes['source'].nodeValue == 'locale': + replace = alias.dom.attributes['path'].nodeValue.split('/') + tags = self.__xpathJoin(tags[:i], replace, tags[i:]) + break + else: + for elt in elt.findAllChildren(tag, attrs): + if draft is None or elt.draft <= draft: + break # and process the 
next selector + else: + break + if replace: + break + else: + # processed all selectors + try: + return elt.dom.firstChild.nodeValue + except (AttributeError, KeyError): + # No match + pass + if not replace: + break + + sought = '/'.join(tags) + if sought != xpath: + sought += ' (for {})'.format(xpath) + raise Error('No {} in {}'.format(sought, self.name)) + + def findOr(self, xpath, fallback = ''): + """Use a fall-back value if we don't find data. + + Like find, but takes a fall-back value to return instead of + raising Error on failure.""" + try: + return self.find(xpath) + except Error: + return fallback + + def tagCodes(self): + """Yields four tag codes + + The tag codes are language, script, country and variant; an + empty value for any of them indicates that no value was + provided. The values are obtained from the primary file's + top-level <identity> element. An Error is raised if any + top-level <alias> element of this file has a non-empty source + attribute; that attribute value is mentioned in the error's + message.""" + root = self.nodes[0] + for alias in root.findAllChildren('alias'): + try: + source = alias.dom.attributes['source'].nodeValue + except (KeyError, AttributeError): + pass + else: + raise Error('Alias to {}'.format(source)) + + ids = root.findUniqueChild('identity') + for code in ('language', 'script', 'territory', 'variant'): + for node in ids.findAllChildren(code): + try: + yield node.dom.attributes['type'].nodeValue + except (KeyError, AttributeError): + pass + else: + break # only want one value for each code + else: # No value for this code, use empty + yield '' + + def currencyData(self, isoCode): + """Fetches currency data for this locale. + + Single argument, isoCode, is the ISO currency code for the + currency in use in the country. See also numericData, which + includes some currency formats. 
+ """ + if isoCode: + stem = 'numbers/currencies/currency[{}]/'.format(isoCode) + symbol = self.findOr(stem + 'symbol') + name = ';'.join( + self.findOr(stem + 'displayName' + tail) + for tail in ('',) + tuple( + '[count={}]'.format(x) for x in ('zero', 'one', 'two', 'few', 'many', 'other') + )) + ';' + else: + symbol = name = '' + yield 'currencySymbol', symbol + yield 'currencyDisplayName', name + + def numericData(self, lookup, complain = lambda text: None): + """Generate assorted numeric data for the locale. + + First argument, lookup, is a callable that maps a numbering + system's name to certain data about the system, as a mapping; + we expect this to have u'digits' as a key. + """ + system = self.find('numbers/defaultNumberingSystem') + stem = 'numbers/symbols[numberSystem={}]/'.format(system) + decimal = self.find(stem + 'decimal') + group = self.find(stem + 'group') + assert decimal != group, (self.name, system, decimal) + yield 'decimal', decimal + yield 'group', group + yield 'percent', self.find(stem + 'percentSign') + yield 'list', self.find(stem + 'list') + # FIXME: don't lower-case: + yield 'exp', self.find(stem + 'exponential').lower() + + digits = lookup(system)['digits'] + assert len(digits) == 10 + zero = digits[0] + # Qt's number-formatting code assumes digits are consecutive: + assert all(ord(c) == i for i, c in enumerate(digits, ord(zero))) + yield 'zero', zero + + plus = self.find(stem + 'plusSign') + minus = self.find(stem + 'minusSign') + yield 'plus', plus + yield 'minus', minus + + # Currency formatting (currencyFormat may have a type field): + money = self.find('numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern') + money = self.__currencyFormats(money, plus, minus) + yield 'currencyFormat', money.next() + neg = '' + for it in money: + assert not neg, 'There should be at most one more pattern' + neg = it + yield 'currencyNegativeFormat', neg + + def textPatternData(self): + for key in ('quotationStart', 
'alternateQuotationEnd', + 'quotationEnd', 'alternateQuotationStart'): + yield key, self.find('delimiters/' + key) + + for key in ('start', 'middle', 'end'): + yield ('listPatternPart' + key.capitalize(), + self.__fromLdmlListPattern(self.find( + 'listPatterns/listPattern/listPatternPart[{}]'.format(key)))) + yield ('listPatternPartTwo', + self.__fromLdmlListPattern(self.find( + 'listPatterns/listPattern/listPatternPart[2]'))) + + stem = 'dates/calendars/calendar[gregorian]/' + # TODO: is wide really the right width to use here ? + # abbreviated might be an option ... or try both ? + meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/' + for key in ('am', 'pm'): + yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key), + draft = Node.draftScore('contributed')) + + for pair in (('long', 'full'), ('short', 'short')): + for key in ('time', 'date'): + yield (pair[0] + key.capitalize() + 'Format', + convert_date(self.find( + stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format( + key, key, pair[1], key)))) + + def endonyms(self, language, script, country, variant): + # TODO: take variant into account ? 
+ for seq in ((language, script, country), + (language, script), (language, country), (language,)): + if not all(seq): + continue + try: + yield ('languageEndonym', + self.find('localeDisplayNames/languages/language[{}]' + .format('_'.join(seq)))) + except Error: + pass + else: + break + else: + # grumble(failed to find endonym for language) + yield 'languageEndonym', '' + + yield ('countryEndonym', + self.findOr('localeDisplayNames/territories/territory[{}]' + .format(country))) + + def unitData(self): + yield ('byte_unit', + self.findOr('units/unitLength[long]/unit[digital-byte]/displayName', + 'bytes')) + + unit = self.__findUnit('', 'B') + cache = [] # Populated by the SI call, to give hints to the IEC call + yield ('byte_si_quantified', + ';'.join(self.__unitCount('', unit, cache))) + # IEC 60027-2 + # http://physics.nist.gov/cuu/Units/binary.html + yield ('byte_iec_quantified', + ';'.join(self.__unitCount('bi', 'iB', cache))) + + def calendarNames(self, calendars): + namings = self.__nameForms + for cal in calendars: + stem = 'dates/calendars/calendar[' + cal + ']/months/' + for key, mode, size in namings: + prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/' + yield (key + 'Months_' + cal, + ';'.join(self.find(stem + prop + 'month[{}]'.format(i)) + for i in range(1, 13)) + ';') + + # Day data (for Gregorian, at least): + stem = 'dates/calendars/calendar[gregorian]/days/' + days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat') + for (key, mode, size) in namings: + prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day' + yield (key + 'Days', + ';'.join(self.find(stem + prop + '[' + day + ']') + for day in days) + ';') + + # Implementation details + __nameForms = ( + ('standaloneLong', 'stand-alone', 'wide'), + ('standaloneShort', 'stand-alone', 'abbreviated'), + ('standaloneNarrow', 'stand-alone', 'narrow'), + ('long', 'format', 'wide'), + ('short', 'format', 'abbreviated'), + ('narrow', 'format', 'narrow'), + ) # Used for month and day 
names + + def __findUnit(self, keySuffix, quantify, fallback=''): + # The displayName for a quantified unit in en.xml is kByte + # (even for unitLength[narrow]) instead of kB (etc.), so + # prefer any unitPattern provided, but prune its placeholder: + for size in ('short', 'narrow'): # TODO: reverse order ? + stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify) + for count in ('many', 'few', 'two', 'other', 'zero', 'one'): + try: + ans = self.find(stem + 'unitPattern[count={}]'.format(count)) + except Error: + continue + + # TODO: do count-handling, instead of discarding placeholders + if False: # TODO: do it this way, instead ! + ans = ans.replace('{0}', '').strip() + elif ans.startswith('{0}'): + ans = ans[3:].lstrip() + if ans: + return ans + + try: + return self.find(stem + 'displayName') + except Error: + pass + + return fallback + + def __unitCount(self, keySuffix, suffix, cache, + # Stop at exa/exbi: 16 exbi = 2^{64} < zetta = + # 1000^7 < zebi = 2^{70}, the next quantifiers up: + siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')): + """Work out the unit quantifiers. + + Unfortunately, the CLDR data only go up to terabytes and we + want all the way to exabytes; but we can recognize the SI + quantifiers as prefixes, strip and identify the tail as the + localized translation for 'B' (e.g. French has 'octet' for + 'byte' and uses ko, Mo, Go, To from which we can extrapolate + Po, Eo). + + Should be called first for the SI quantifiers, with suffix = + 'B', then for the IEC ones, with suffix = 'iB'; the list cache + (initially empty before first call) is used to let the second + call know what the first learned about the localized unit. 
+ """ + if suffix == 'iB': # second call, re-using first's cache + if cache: + byte = cache.pop() + if all(byte == k for k in cache): + suffix = 'i' + byte + for q in siQuantifiers: + # Those don't (yet, v36) exist in CLDR, so we always get the fall-back: + yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix) + else: # first call + tail = suffix = suffix or 'B' + for q in siQuantifiers: + it = self.__findUnit(keySuffix, q) + # kB for kilobyte, in contrast with KiB for IEC: + q = q[0] if q == 'kilo' else q[0].upper() + if not it: + it = q + tail + elif it.startswith(q): + rest = it[1:] + tail = rest if all(rest == k for k in cache) else suffix + cache.append(rest) + yield it + + @staticmethod + def __currencyFormats(patterns, plus, minus): + for p in patterns.split(';'): + p = p.replace('0', '#').replace(',', '').replace('.', '') + try: + cut = p.find('#') + 1 + except ValueError: + pass + else: + p = p[:cut] + p[cut:].replace('#', '') + p = p.replace('#', "%1") + # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns + # there can be doubled or trippled currency sign, however none of the + # locales use that. + p = p.replace(u'\xa4', "%2") + # Single quote goes away, but double goes to single: + p = p.replace("''", '###').replace("'", '').replace('###', "'") + # Use number system's signs: + p = p.replace('+', plus).replace('-', minus) + yield p + + @staticmethod + def __fromLdmlListPattern(pattern): + # This is a very limited parsing of the format for list pattern part only. 
+ return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3') + + @staticmethod + def __fromLdmlPath(seq): # tool function for __xpathJoin() + """Convert LDML's [@name='value'] to our [name=value] form.""" + for it in seq: + # First dismember it: + attrs = it.split('[') + tag = attrs.pop(0) + if not attrs: # Short-cut the easy case: + yield it + continue + + assert all(x.endswith(']') for x in attrs) + attrs = [x[:-1].split('=') for x in attrs] + # Then fix each attribute specification in it: + attrs = [(x[0][1:] if x[0].startswith('@') else x[0], + x[1][1:-1] if x[1].startswith("'") and x[1].endswith("'") else x[1]) + for x in attrs] + # Finally, put it all back together: + attrs = ['='.join(x) + ']' for x in attrs] + attrs.insert(0, tag) + yield '['.join(attrs) + + @classmethod + def __xpathJoin(cls, head, insert, tail): + """Join three lists of XPath selectors. + + Each of head, insert and tail is a sequence of selectors but + insert may start with some uses of '..', that we want to + resolve away, and may use LDML's attribute format, that we + want to convert to our format.""" + while insert and insert[0] == '..': + insert.pop(0) + head.pop() + return head + list(cls.__fromLdmlPath(insert)) + tail diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index 59161ed9d0..1938be19ea 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -480,7 +480,7 @@ def main(args, out, err): return 1 reader = QLocaleXmlReader(qlocalexml) - locale_map = dict(reader.loadLocaleMap(calendars, sys.stderr.write)) + locale_map = dict(reader.loadLocaleMap(calendars, err.write)) locale_keys = locale_map.keys() compareLocaleKeys.default_map = dict(reader.defaultMap()) diff --git a/util/locale_database/xpathlite.py b/util/locale_database/xpathlite.py deleted file mode 100644 index 3da8b24656..0000000000 --- a/util/locale_database/xpathlite.py +++ /dev/null @@ -1,284 +0,0 @@ 
#!/usr/bin/env python
#############################################################################
##
## Copyright (C) 2016 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
##
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
## Commercial License Usage
## Licensees holding valid commercial Qt licenses may use this file in
## accordance with the commercial license agreement provided with the
## Software or, alternatively, in accordance with the terms contained in
## a written agreement between you and The Qt Company. For licensing terms
## and conditions see https://www.qt.io/terms-conditions. For further
## information use the contact form at https://www.qt.io/contact-us.
##
## GNU General Public License Usage
## Alternatively, this file may be used under the terms of the GNU
## General Public License version 3 as published by the Free Software
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
## included in the packaging of this file. Please review the following
## information to ensure the GNU General Public License requirements will
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
##
## $QT_END_LICENSE$
##
#############################################################################
"""A very small XPath-like lookup layer over LDML (CLDR) XML files.

Paths are '/'-joined steps; each step is a tag name, optionally
followed by [name=value] or [value] (attribute name 'type' implied).
Parsed documents and locale lookup chains are cached at module level."""

import sys
import os
import xml.dom.minidom

from localetools import Error

class DraftResolution:
    """Numeric ranking of the draft status names.

    See http://www.unicode.org/cldr/process.html for description."""
    unconfirmed = 'unconfirmed'
    provisional = 'provisional'
    contributed = 'contributed'
    approved = 'approved'
    _values = { unconfirmed : 1, provisional : 2, contributed : 3, approved : 4 }

    def __init__(self, resolution):
        self.resolution = resolution

    def toInt(self):
        """Return the rank (1 to 4) of this status.

        Raises KeyError if the status is not one of the four known names."""
        return DraftResolution._values[self.resolution]

doc_cache = {}
def parseDoc(file):
    """Return the minidom document for file, parsing it on first use only."""
    if file not in doc_cache:
        doc_cache[file] = xml.dom.minidom.parse(file)
    return doc_cache[file]

def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
    """Find the first suitable child element of parent.

    A child is suitable if it is an element named tag_name and, when
    arg_value is non-empty, has an attribute arg_name equal to
    arg_value. When draft is given, a child with a draft attribute
    ranking below draft (see DraftResolution) is skipped; a child
    with no draft attribute is accepted. Returns the element, or
    False if there is none."""
    for node in parent.childNodes:
        if node.nodeType != node.ELEMENT_NODE or node.nodeName != tag_name:
            continue
        if arg_value:
            if not node.hasAttribute(arg_name):
                continue
            if node.getAttribute(arg_name) != arg_value:
                continue
        # If draft is not specified on the node then it's approved.
        if draft and node.hasAttribute('draft'):
            value = DraftResolution(node.getAttribute('draft')).toInt()
            exemplar = DraftResolution(draft).toInt()
            if exemplar > value:
                continue
        return node
    return False

def codeMapsFromFile(file):
    """Extract mappings of language, script and country codes to names.

    The file shall typically be common/main/en.xml, which contains a
    localeDisplayNames element with children languages, scripts and
    territories; each element in each of these has a code as its type
    attribute and its name as element content. This returns a mapping
    with keys 'language', 'script' and 'country', each of which
    has, as value, a mapping of the relevant codes to names."""
    parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
    keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
    for src, dst in keys.items():
        child = findChild(parent, src)
        data = result[dst] = {}
        for elt in child.childNodes:
            # Non-element nodes have no attributes; skip them.
            if elt.attributes and elt.hasAttribute('type'):
                key, value = elt.getAttribute('type'), elt.childNodes[0].wholeText
                # Don't over-write previously-read data for an alt form:
                if elt.hasAttribute('alt') and key in data:
                    continue
                data[key] = value

    return result

def _splitTagSpec(tag_spec, ldml=False):
    """Split one step of a path into (tag_name, arg_name, arg_value).

    Without a bracket the step is the tag name, the attribute name is
    'type' and the value is empty. With a bracket, its content
    (less the final character of the step) is split on '='; exactly
    two parts give the name and value, otherwise the first part is
    the value and the name stays 'type'. If ldml is true, every '@'
    and apostrophe is removed from the name (the value is left
    untouched)."""
    arg_name, arg_value = 'type', ''
    left_bracket = tag_spec.find('[')
    if left_bracket == -1:
        return tag_spec, arg_name, arg_value

    parts = tag_spec[left_bracket + 1:-1].split("=")
    if len(parts) == 2:
        arg_name, arg_value = parts
        if ldml:
            arg_name = arg_name.replace("@", "").replace("'", "")
    else:
        arg_value = parts[0]
    return tag_spec[:left_bracket], arg_name, arg_value

def findTagsInFile(file, path):
    """List the tags found at path in file, with their attributes.

    Walks down from the document element following path. Returns
    None if some step is not found. Otherwise returns a list of
    [name, attribute-items] pairs: one for each child of the final
    element that has attributes or, if the final element has no
    children at all, one for that element itself if it has
    attributes."""
    elt = parseDoc(file).documentElement
    for tag_spec in path.split("/"):
        tag_name, arg_name, arg_value = _splitTagSpec(tag_spec)
        elt = findChild(elt, tag_name, arg_name, arg_value)
        if not elt:
            return None

    if elt.childNodes:
        return [[node.nodeName, node.attributes.items()]
                for node in elt.childNodes if node.attributes]
    if elt.attributes:
        return [[elt.nodeName, elt.attributes.items()]]
    return []

def _findEntryInFile(file, path, draft=None, attribute=None):
    """Look up path in a single file.

    Returns a pair (value, aliaspath). If an alias element with
    source="locale" is met on the way down, value is None and
    aliaspath is the rewritten path at which lookup should start
    again. Otherwise aliaspath is None and value is: the empty
    string if a step of path was not found; the named attribute of
    the final element (None if absent) when attribute is given; else
    the nodeValue of the final element's first child (None if it has
    no children)."""
    elt = parseDoc(file).documentElement
    tag_spec_list = path.split("/")
    for i, tag_spec in enumerate(tag_spec_list):
        tag_name, arg_name, arg_value = _splitTagSpec(tag_spec, ldml=True)
        alias = findChild(elt, 'alias')
        if alias and alias.attributes['source'].nodeValue == 'locale':
            target = alias.attributes['path'].nodeValue
            aliaspath = []
            # Resolve all dot-dot parts of the path:
            for step in tag_spec_list[:i] + target.split("/"):
                if step == '..':
                    del aliaspath[-1:] # no-op when already empty
                else:
                    aliaspath.append(step)
            # Remove attribute specification that our xpathlite doesn't support:
            aliaspath = [step.replace("@type=", "").replace("'", "")
                         for step in aliaspath]
            # Append the remaining path. "locale" aliases are special:
            # we need to start lookup from scratch.
            return (None, "/".join(aliaspath + tag_spec_list[i:]))
        elt = findChild(elt, tag_name, arg_name, arg_value, draft)
        if not elt:
            return ("", None)

    if attribute is not None:
        if elt.hasAttribute(attribute):
            return (elt.getAttribute(attribute), None)
        return (None, None)

    first = elt.firstChild
    if first is None: # element without content
        return (None, None)
    return (first.nodeValue, None)

def findAlias(file):
    """Return the source of file's top-level alias element, if any.

    Returns False when the document element has no alias child or
    that child has no source attribute."""
    alias_elt = findChild(parseDoc(file).documentElement, "alias")
    if not alias_elt or not alias_elt.hasAttribute('source'):
        return False
    return alias_elt.getAttribute('source')

lookup_chain_cache = {}
parent_locales = {}
def _fixedLookupChain(dirname, name):
    """Return the list of locale names to search, in order, for name.

    The chain starts as name followed by its successive truncations
    at '_' (az_Latn_AZ, az_Latn, az). At the first entry listed under
    a parentLocale in supplementalData.xml, the rest is replaced by
    that parent's own chain (or dropped, if the parent is root).
    Results are cached by name; the returned list is the cached
    object, so callers should not modify it."""
    if name in lookup_chain_cache:
        return lookup_chain_cache[name]

    # see http://www.unicode.org/reports/tr35/#Parent_Locales
    if not parent_locales:
        for ns in findTagsInFile(dirname + "/../supplemental/supplementalData.xml", "parentLocales"):
            # ns looks like this: ['parentLocale', [('parent', 'root'),
            #                      ('locales', 'az_Cyrl bs_Cyrl en_Dsrt ..')]]
            attrs = dict(ns[1])
            parent_locales[attrs.get("parent", "")] = attrs["locales"].split(" ")

    # Split locale name into parts and build the prefixes from longest
    # to shortest; example: az_Latn_AZ => [az_Latn_AZ, az_Latn, az]
    parts = name.split("_")
    items = ["_".join(parts[:n]) for n in range(len(parts), 0, -1)]

    for i, item in enumerate(items):
        for parent_locale, children in parent_locales.items():
            if item in children:
                chain = items[:i + 1]
                if parent_locale != "root":
                    chain = chain + _fixedLookupChain(dirname, parent_locale)
                lookup_chain_cache[name] = chain
                return chain

    lookup_chain_cache[name] = items
    return items

def _findEntry(base, path, draft=None, attribute=None):
    """Look up path for locale file base, following its lookup chain.

    Tries each existing file in base's lookup chain in turn. A file
    that is wholly an alias redirects the search to its source file;
    a source="locale" alias within a file restarts the search with
    the rewritten path. Returns the first non-empty value found, or
    None. Raises Error if an aliased file does not exist."""
    if base.endswith(".xml"):
        base = base[:-4]
    dirname, filename = os.path.split(base)

    for item in _fixedLookupChain(dirname, filename):
        file = dirname + "/" + item + ".xml"
        if not os.path.isfile(file):
            continue
        alias = findAlias(file)
        if alias:
            # if alias is found we should follow it and stop processing current file
            # see http://www.unicode.org/reports/tr35/#Common_Elements
            aliasfile = os.path.dirname(file) + "/" + alias + ".xml"
            if not os.path.isfile(aliasfile):
                raise Error("findEntry: fatal error: found an alias '%s' to '%s', but the alias file couldn't be found" % (filename, alias))
            # found an alias, recurse into parsing it
            return _findEntry(aliasfile, path, draft, attribute)
        result, aliaspath = _findEntryInFile(file, path, draft, attribute)
        if aliaspath:
            # start lookup again because of the alias source="locale"
            return _findEntry(base, aliaspath, draft, attribute)
        if result:
            return result
    return None

def findEntry(base, path, draft=None, attribute=None):
    """Look up path for locale file base, falling back to root.xml.

    base may be given with or without its .xml suffix. The locale's
    lookup chain is searched first, then root.xml in the same
    directory; if root.xml redirects via an alias, the search is
    repeated with the redirected path. Returns the value found (None
    if path is empty). Raises Error if the key cannot be found."""
    if base.endswith(".xml"):
        base = base[:-4]
    dirname, filename = os.path.split(base)

    result = None
    while path:
        result = _findEntry(base, path, draft, attribute)
        if result:
            return result
        result, aliaspath = _findEntryInFile(dirname + "/root.xml", path, draft, attribute)
        if result:
            return result
        if not aliaspath:
            raise Error("findEntry: fatal error: %s: cannot find key %s" % (filename, path))
        path = aliaspath

    return result