summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--util/locale_database/cldr.py498
-rwxr-xr-xutil/locale_database/cldr2qlocalexml.py636
-rw-r--r--util/locale_database/ldml.py450
-rwxr-xr-xutil/locale_database/qlocalexml2cpp.py2
-rw-r--r--util/locale_database/xpathlite.py284
5 files changed, 972 insertions, 898 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
index 7890adf307..94459b9e3f 100644
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@@ -27,7 +27,8 @@
#############################################################################
"""Digesting the CLDR's data.
-Provides two class:
+Provides two classes:
+ CldrReader -- driver for reading CLDR data
CldrAccess -- used by the reader to access the tree of data files
The former should normally be all you need to access.
@@ -38,9 +39,206 @@ from xml.dom import minidom
from weakref import WeakValueDictionary as CacheDict
import os
-from localetools import Error
-from ldml import Node, Supplement
+from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from qlocalexml import Locale
+class CldrReader (object):
+    def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None):
+        """Prepare to read CLDR data from an unpacked archive.
+
+        First parameter, root, is the file-system path to the root of
+        the unpacked CLDR archive; its common/ sub-directory should
+        contain dtd/, main/ and supplemental/ sub-directories.
+
+        Optional second parameter, grumble, is a callable taking a
+        single message; it is used for warnings and complaints
+        (sys.stderr.write would be suitable). Optional third
+        parameter, whitter, is a similar callable, used for less
+        interesting output; pass sys.stderr.write for verbose
+        output. Both default to a no-op that ignores its argument."""
+        self.grumble = grumble
+        self.whitter = whitter
+        # All access to the data files goes via this object:
+        self.root = CldrAccess(root)
+
+ def likelySubTags(self):
+ """Generator for likely subtag information.
+
+ Yields pairs (have, give) of 4-tuples; if what you have
+ matches the left member, giving the right member is probably
+ sensible. Each 4-tuple's entries are the full names of a
+ language, a script, a country (strictly territory) and a
+ variant (currently ignored)."""
+ # Right-hand tags skipped for unknown codes; only collected, see end.
+ skips = []
+ for got, use in self.root.likelySubTags():
+ try:
+ have = self.__parseTags(got)
+ give = self.__parseTags(use)
+ except Error as e:
+ # Unknown-code failures for und_* sources, or where the
+ # target merely extends the source, are collected quietly;
+ # every other failure is reported via grumble.
+ if ((use.startswith(got) or got.startswith('und_'))
+ and e.message.startswith('Unknown ') and ' code ' in e.message):
+ skips.append(use)
+ else:
+ self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message))
+ continue
+ # Skip entries whose language, script and country names are all
+ # of the form Any<Capitalised...>, i.e. nothing specific to match:
+ if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]):
+ continue
+
+ give = (give[0],
+ # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+ have[1] if give[1] == 'AnyScript' else give[1],
+ have[2] if give[2] == 'AnyCountry' else give[2],
+ give[3]) # AnyVariant similarly ?
+
+ yield have, give
+
+ if skips:
+ # TODO: look at LDML's reserved locale tag names; they
+ # show up a lot in this, and may be grounds for filtering
+ # more out.
+ # The report of skipped tags is currently disabled:
+ pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
+
+    def readLocales(self, calendars = ('gregorian',)):
+        """Read every usable locale into a mapping.
+
+        Optional parameter calendars is passed on to the locale
+        scan; it defaults to just 'gregorian'.
+
+        Returns a dict whose keys are (language_id, script_id,
+        country_id, variant_code) tuples and whose values are the
+        locale objects they identify. If several locales share a
+        key, the last one scanned is kept."""
+        found = {}
+        # Complete the whole scan before building the mapping:
+        for locale in tuple(self.__allLocales(calendars)):
+            key = (locale.language_id, locale.script_id,
+                   locale.country_id, locale.variant_code)
+            found[key] = locale
+        return found
+
+ def __allLocales(self, calendars):
+ """Generator for the locale objects of all locales we can handle.
+
+ Parameter calendars is passed through to __getLocaleData().
+ First scans the default content locales, then the locales
+ named by files; yields what __getLocaleData() returns for
+ each that has (at least) a language and a country. Locales
+ that are skipped, or whose data raise Error, are reported via
+ self.whitter or self.grumble instead of being yielded."""
+ def skip(locale, reason):
+ return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason)
+
+ for locale in self.root.defaultContentLocales:
+ try:
+ # __splitLocale() generates only one value for a bare language
+ # tag, so this unpacking raises ValueError in that case:
+ language, script, country, variant = self.__splitLocale(locale)
+ except ValueError:
+ self.whitter(skip(locale, 'only language tag'))
+ continue
+
+ if not (script or country):
+ self.grumble(skip(locale, 'second tag is neither script nor territory'))
+ continue
+
+ # Quietly ignore anything that lacks a language or a country:
+ if not (language and country):
+ continue
+
+ try:
+ yield self.__getLocaleData(self.root.locale(locale), calendars,
+ language, script, country, variant)
+ except Error as e:
+ self.grumble(skip(locale, e.message))
+
+ for locale in self.root.fileLocales:
+ try:
+ chain = self.root.locale(locale)
+ language, script, country, variant = chain.tagCodes()
+ assert language
+ # TODO: this skip should probably be based on likely
+ # sub-tags, instead of empty country: if locale has a
+ # likely-subtag expansion, that's what QLocale uses,
+ # and we'll be saving its data for the expanded locale
+ # anyway, so don't need to record it for itself.
+ # See also QLocaleXmlReader.loadLocaleMap's grumble.
+ if not country:
+ continue
+ yield self.__getLocaleData(chain, calendars, language, script, country, variant)
+ except Error as e:
+ self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message))
+
+ # textwrap is only imported into the class body long enough to bind
+ # textwrap.wrap as a default argument; it is deleted again below.
+ import textwrap
+ @staticmethod
+ def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap):
+ """Write a comma-joined list of tokens, wrapped at 80 columns.
+
+ Calls writer once, with prefix followed by the tokens joined
+ by ', ', wrapped into lines and terminated by a newline. Do
+ not pass wrap; it is just a local name for textwrap.wrap."""
+ writer('\n'.join(wrap(prefix + ', '.join(tokens),
+ subsequent_indent=' ', width=80)) + '\n')
+ del textwrap
+
+    def __parseTags(self, locale):
+        """Map a locale name to the full names of its components.
+
+        Splits locale into language, script, country and variant
+        codes (the last three empty when locale is only a language
+        tag) and returns a 4-tuple of the names (second member of
+        each pair) that self.root.codesToIdName() gives for them."""
+        fields = self.__splitLocale(locale)
+        language = next(fields)
+        # __splitLocale() generates either one value or four:
+        rest = tuple(fields)
+        if len(rest) == 3:
+            script, country, variant = rest
+        else:
+            script = country = variant = ''
+        pairs = self.root.codesToIdName(language, script, country, variant)
+        return tuple(pair[1] for pair in pairs)
+
+    def __splitLocale(self, name):
+        """Generate (language, script, territory, variant) from a locale name
+
+        Ignores any trailing fields (with a warning, sent to
+        self.grumble), leaves script (a capitalised four-letter
+        token), territory (either a number or an all-uppercase token)
+        or variant (upper case and digits) empty if unspecified. Only
+        generates one entry if name is a single tag (i.e. contains no
+        underscores). Always yields 1 or 4 values, never 2 or 3."""
+        tags = iter(name.split('_'))
+        yield next(tags) # Language; split() always gives at least one entry
+        tag = next(tags, None)
+        if tag is None: # Single tag: only the language is generated
+            return
+
+        # Script is always four letters, always capitalised:
+        if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+            yield tag
+            tag = next(tags, '')
+        else:
+            yield ''
+
+        # Territory is upper-case or numeric:
+        if tag and (tag.isupper() or tag.isdigit()):
+            yield tag
+            tag = next(tags, '')
+        else:
+            yield ''
+
+        # Variant can be any mixture of upper-case and digits.
+        if tag and all(c.isupper() or c.isdigit() for c in tag):
+            yield tag
+            tag = ''
+        else:
+            yield ''
+
+        # If nothing is left, there is nothing to warn about:
+        if not tag:
+            tag = next(tags, None)
+            if tag is None:
+                return
+        # tag is a string, so wrap it in a tuple before joining it
+        # with whatever other tags remain:
+        self.grumble('Ignoring unparsed cruft {} in {}\n'.format(
+            '_'.join((tag,) + tuple(tags)), name))
+
+ def __getLocaleData(self, scan, calendars, language, script, country, variant):
+ """Assemble the Locale object for one locale.
+
+ First parameter, scan, supplies the locale's own data (via its
+ currencyData(), numericData(), textPatternData(), endonyms(),
+ unitData() and calendarNames() methods); calendars is passed
+ to the last of these. The remaining parameters are the
+ locale's language, script, country and variant codes.
+
+ Returns the Locale, populated from scan and from self.root's
+ week and currency data for country. Whatever error
+ self.root.codesToIdName() raises for unknown codes is allowed
+ to propagate."""
+ ids, names = zip(*self.root.codesToIdName(language, script, country, variant))
+ # Language and country must both be known and specific:
+ assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant)
+ locale = Locale(
+ language = names[0], language_code = language, language_id = ids[0],
+ script = names[1], script_code = script, script_id = ids[1],
+ country = names[2], country_code = country, country_id = ids[2],
+ variant_code = variant)
+
+ firstDay, weStart, weEnd = self.root.weekData(country)
+ assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+ for day in (firstDay, weStart, weEnd))
+
+ locale.update(firstDayOfWeek = firstDay,
+ weekendStart = weStart,
+ weekendEnd = weEnd)
+
+ iso, digits, rounding = self.root.currencyData(country)
+ # digits and rounding may arrive as text or as the integer
+ # defaults; int() accepts either:
+ locale.update(currencyIsoCode = iso,
+ currencyDigits = int(digits),
+ currencyRounding = int(rounding))
+
+ locale.update(scan.currencyData(iso))
+ locale.update(scan.numericData(self.root.numberSystem, self.whitter))
+ locale.update(scan.textPatternData())
+ locale.update(scan.endonyms(language, script, country, variant))
+ locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ...
+ locale.update(scan.calendarNames(calendars)) # Names of days and months
+
+ return locale
+
+# Note: various caches assume this class is a singleton, so the
+# "default" value for a parameter no caller should pass can serve as
+# the cache. If a process were to instantiate this class with distinct
+# roots, each cache would be filled by the first to need it !
class CldrAccess (object):
def __init__(self, root):
"""Set up a master object for accessing CLDR data.
@@ -50,6 +248,12 @@ class CldrAccess (object):
contain dtd/, main/ and supplemental/ sub-directories."""
self.root = root
+    def xml(self, *path):
+        """Load one XML file, wrapping its root element as an XmlScanner.
+
+        The components of path are interpreted relative to self.root"""
+        document = self.__xml(path)
+        return XmlScanner(Node(document))
+
def supplement(self, name):
"""Loads supplemental data as a Supplement object.
@@ -57,6 +261,117 @@ class CldrAccess (object):
"""
return Supplement(Node(self.__xml(('common', 'supplemental', name))))
+    def locale(self, name):
+        """Load everything known about a locale, as a LocaleScanner.
+
+        The name should be a locale name; adding suffix '.xml' to it
+        should usually yield a file in common/main/. The
+        LocaleScanner returned is given name, the chain of documents
+        from __localeRoots(name) and the root locale; the scanner's
+        methods are responsible for handling inheritance between
+        those."""
+        chain = self.__localeRoots(name)
+        return LocaleScanner(name, chain, self.__rootLocale)
+
+    @property
+    def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir,
+                    splitExtension = os.path.splitext):
+        """Generator for locale IDs seen in file-names.
+
+        Every *.xml file in common/main/, aside from root.xml, is
+        taken to name a locale; its name, without the extension, is
+        yielded."""
+        mainDir = joinPath(self.root, 'common', 'main')
+        for entry in listDirectory(mainDir):
+            stem, ext = splitExtension(entry)
+            if stem == 'root' or ext != '.xml':
+                continue
+            yield stem
+
+    @property
+    def defaultContentLocales(self):
+        """Generator for the default content locales.
+
+        Yields each space-separated entry of the locales attribute
+        of each metadata/defaultContent entry found in
+        supplementalMetadata.xml; entries lacking that attribute are
+        ignored."""
+        metadata = self.supplement('supplementalMetadata.xml')
+        for tag, attrs in metadata.find('metadata/defaultContent'):
+            try:
+                listed = attrs['locales']
+            except KeyError:
+                continue
+            for locale in listed.split():
+                yield locale
+
+    def likelySubTags(self):
+        """Generator for the raw likely sub-tag mappings.
+
+        Yields a (from, to) pair of attribute values for each entry
+        that likelySubtags.xml provides under likelySubtags."""
+        source = self.supplement('likelySubtags.xml')
+        for tag, attrs in source.find('likelySubtags'):
+            yield attrs['from'], attrs['to']
+
+    def numberSystem(self, system):
+        """Get a description of a numbering system.
+
+        Returns the entry that the number system cache holds for
+        system (a mapping of the attributes read from
+        numberingSystems.xml, keyed on entries such as u'digits',
+        u'type' and u'id'; the value for this last is system).
+        Raises ldml.Error if system is not in that cache."""
+        known = self.__numberSystems
+        if system not in known:
+            raise Error('Unsupported number system: {}'.format(system))
+        return known[system]
+
+    def weekData(self, country):
+        """Data on the weekly cycle.
+
+        Returns a triple (W, S, E) of en's short names for week-days;
+        W is the first day of the week, S the start of the week-end
+        and E the end of the week-end. A country for which no data
+        is available gets the data for CLDR's territory 001 (The
+        World)."""
+        table = self.__weekData
+        if country in table:
+            return table[country]
+        return table['001']
+
+    def currencyData(self, country):
+        """Returns currency data for the given country code.
+
+        The result is a tuple (ISO4217 code, digit count, rounding
+        mode). A country for which CLDR provides no data gets the
+        default, ('', 2, 1).
+        """
+        table = self.__currencyData
+        if country in table:
+            return table[country]
+        return '', 2, 1
+
+ def codesToIdName(self, language, script, country, variant = ''):
+ """Maps each code to the appropriate ID and name.
+
+ Returns a 4-tuple of (ID, name) pairs corresponding to the
+ language, script, country and variant given. Raises a
+ suitable error if any of them is unknown, indicating all that
+ are unknown plus suitable names for any that could sensibly be
+ added to enumdata.py to make them known.
+
+ Until we implement variant support (QTBUG-81051), the fourth
+ member of the returned tuple is always 0 paired with a string
+ that should not be used."""
+ enum = self.__enumMap
+ # Fast path: all four codes are known.
+ try:
+ return (enum('language')[language],
+ enum('script')[script],
+ enum('country')[country],
+ enum('variant')[variant])
+ except KeyError:
+ pass
+
+ # At least one code is unknown: work out which, to report them all.
+ parts, values = [], [language, script, country, variant]
+ for index, key in enumerate(('language', 'script', 'country', 'variant')):
+ naming, enums = self.__codeMap(key), enum(key)
+ value = values[index]
+ if value not in enums:
+ text = '{} code {}'.format(key, value)
+ # If __codeMap() has a name for the code, suggest it:
+ name = naming.get(value)
+ if name and value != 'POSIX':
+ text += u' (could add {})'.format(name)
+ parts.append(text)
+ if len(parts) > 1:
+ parts[-1] = 'and ' + parts[-1]
+ assert parts
+ # The message starts with 'Unknown ' and contains ' code ';
+ # CldrReader.likelySubTags() tests for exactly that form.
+ raise Error('Unknown ' + ', '.join(parts),
+ language, script, country, variant)
+
def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
"""Digest CLDR's MS-Win time-zone name mapping.
@@ -139,11 +454,97 @@ class CldrAccess (object):
return open(joinPath(self.root, *path))
@property
+    def __rootLocale(self, cache = []):
+        # Loads common/main/root.xml on first use; the default list
+        # (which callers must not replace) holds it thereafter.
+        if cache:
+            return cache[0]
+        cache.append(self.xml('common', 'main', 'root.xml'))
+        return cache[0]
+
+ @property
def __supplementalData(self, cache = []):
if not cache:
cache.append(self.supplement('supplementalData.xml'))
return cache[0]
+ @property
+ def __numberSystems(self, cache = {}, joinPath=os.path.join):
+ # Mapping from number system id to its attributes, read from
+ # numberingSystems.xml on first use and kept in the default
+ # dict, cache, thereafter. The joinPath parameter is unused.
+ if not cache:
+ for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
+ # NOTE(review): u'\x10000' is U+0010 followed by three
+ # zeros (\x takes exactly two hex digits), so an entry
+ # with no digits attribute is never skipped by this
+ # test; u'\U00010000' was presumably intended. Confirm
+ # which behaviour is wanted before changing it.
+ if ord(attrs.get('digits', u'\x10000')[0]) > 0xffff:
+ # FIXME, QTBUG-69324: make this redundant:
+ # omit number system if zero doesn't fit in single-char16 UTF-16 :-(
+ continue
+
+ cache[attrs['id']] = attrs
+ assert cache
+ return cache
+
+    @property
+    def __weekData(self, cache = {}):
+        # Mapping from territory to (first day of week, week-end
+        # start, week-end end); computed on first use and kept in the
+        # default dict, cache, thereafter.
+        if cache:
+            return cache
+
+        firstDay, weStart, weEnd = self.__getWeekData()
+        # The world defaults, given for code '001', fill any gaps:
+        worldFirst = firstDay['001']
+        worldStart = weStart['001']
+        worldEnd = weEnd['001']
+        for land in set(firstDay) | set(weStart) | set(weEnd):
+            cache[land] = (firstDay.get(land, worldFirst),
+                           weStart.get(land, worldStart),
+                           weEnd.get(land, worldEnd))
+        assert cache
+        return cache
+
+    def __getWeekData(self):
+        """Scan for data on the weekly cycle.
+
+        Yields three mappings from locales to en's short names for
+        week-days; a locale that isn't a key of a given mapping
+        should use the '001' (world) locale's value. In order, the
+        mappings give the day on which the week starts, the day on
+        which the week-end starts and the last day of the week-end.
+        Entries with an alt attribute are ignored."""
+        weekDays = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+        source = self.__supplementalData
+        for key in ('firstDay', 'weekendStart', 'weekendEnd'):
+            byTerritory = {}
+            for tag, attrs in source.find('weekData/' + key):
+                assert tag == key
+                day = attrs['day']
+                assert day in weekDays, day
+                if 'alt' not in attrs:
+                    for loc in attrs.get('territories', '').split():
+                        byTerritory[loc] = day
+            yield byTerritory
+
+ @property
+ def __currencyData(self, cache = {}):
+ # Mapping from territory code to (ISO 4217 code, digits, rounding),
+ # computed from the supplemental data on first use and kept in
+ # the default dict, cache, thereafter.
+ if not cache:
+ source = self.__supplementalData
+ for elt in source.findNodes('currencyData/region'):
+ # Defaults, used when no current currency is found:
+ iso, digits, rounding = '', 2, 1
+ try:
+ country = elt.dom.attributes['iso3166'].nodeValue
+ except KeyError:
+ continue
+ # Use the first currency that is neither marked as
+ # tender="false" nor has a to attribute:
+ for child in elt.findAllChildren('currency'):
+ try:
+ if child.dom.attributes['tender'].nodeValue == 'false':
+ continue
+ except KeyError:
+ pass
+ try:
+ child.dom.attributes['to'] # Is set if this element has gone out of date.
+ except KeyError:
+ iso = child.dom.attributes['iso4217'].nodeValue
+ break
+ if iso:
+ # If several entries match, the last one's values are kept.
+ # NOTE(review): these presumably arrive as text, unlike the
+ # integer defaults; __getLocaleData() applies int() to both.
+ for tag, data in source.find(
+ 'currencyData/fractions/info[iso4217={}]'.format(iso)):
+ digits = data['digits']
+ rounding = data['rounding']
+ cache[country] = iso, digits, rounding
+ assert cache
+
+ return cache
+
def __scanLdmlDtd(self, joinPath = os.path.join):
"""Scan the LDML DTD, record CLDR version."""
with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
@@ -151,7 +552,8 @@ class CldrAccess (object):
if line.startswith('<!ATTLIST '):
parts = line.split()
if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
- # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+ # parts[5] is the version, in quotes, maybe
+ # with a final > attached to its end:
self.__cldrVersion = parts[5].split('"')[1]
break
@@ -178,5 +580,93 @@ class CldrAccess (object):
return cache[key]
+    def __codeMap(self, key, cache = {},
+                  # Maps our name for it to CLDR's name:
+                  naming = {'language': 'languages', 'script': 'scripts',
+                            'country': 'territories', 'variant': 'variants'}):
+        # Returns the mapping from codes to en.xml's display names
+        # for key, one of naming's keys. All four mappings are read
+        # on first use and kept in the default dict, cache; callers
+        # should pass only key.
+        if not cache:
+            en = self.xml('common', 'main', 'en.xml')
+            names = en.root.findUniqueChild('localeDisplayNames')
+            for ours, theirs in naming.items():
+                scan = self.__codeMapScan(names.findUniqueChild(theirs))
+                cache[ours] = dict(scan)
+            assert cache
+
+        return cache[key]
+
+    def __codeMapScan(self, node):
+        """Get mapping from codes to element values.
+
+        Passed in node is a <languages>, <scripts>, <territories> or
+        <variants> node, each child of which is a <language>,
+        <script>, <territory> or <variant> node as appropriate, whose
+        type is a code (of the appropriate flavour) and content is its
+        full name. In some cases, two child nodes have the same type;
+        in these cases, one always has an alt attribute and we should
+        prefer the other. Yields all such type, content pairs found
+        in node's children (skipping any with an alt attribute, if
+        their type has been seen previously).
+
+        Children that are not elements, that lack a type attribute,
+        that are empty or whose first child is not text are
+        skipped."""
+        seen = set()
+        for elt in node.dom.childNodes:
+            try:
+                key = elt.attributes['type'].nodeValue
+                value = elt.childNodes[0].wholeText
+            except (KeyError, ValueError, TypeError, IndexError, AttributeError):
+                # TypeError: not an element (its attributes is None);
+                # KeyError: no type attribute; IndexError: element
+                # with no content; AttributeError: first child is not
+                # a text node. None of these has a name to offer.
+                continue
+
+            # Having a type attribute, elt must be an element, so it
+            # supports hasAttribute():
+            if key not in seen or not elt.hasAttribute('alt'):
+                yield key, value
+            seen.add(key)
+
+ # CLDR uses inheritance between locales to save repetition:
+    def __parentLocale(self, name, cache = {}):
+        # see http://www.unicode.org/reports/tr35/#Parent_Locales
+        # Returns the explicitly declared parent of locale name;
+        # raises KeyError if none is declared. The declarations are
+        # read on first use and kept in the default dict, cache.
+        if not cache:
+            declared = self.__supplementalData.find('parentLocales')
+            for tag, attrs in declared:
+                parent = attrs.get('parent', '')
+                cache.update((child, parent)
+                             for child in attrs['locales'].split())
+            assert cache
+
+        return cache[name]
+
+ def __localeAsDoc(self, name, aliasFor = None,
+ joinPath = os.path.join, exists = os.path.isfile):
+ # Returns the parsed document for locale name's file in
+ # common/main/, following alias elements to the file they name.
+ # Returns None (implicitly) if name has no file, unless it was
+ # reached via an alias, in which case Error is raised. Callers
+ # should pass only name; aliasFor is the name at which a chain
+ # of aliases started and the rest are just local names.
+ path = ('common', 'main', name + '.xml')
+ if exists(joinPath(self.root, *path)):
+ elt = self.__xml(path)
+ for child in Node(elt).findAllChildren('alias'):
+ try:
+ alias = child.dom.attributes['source'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ # Follow the first alias that has a source, remembering
+ # the original name for use in any error message:
+ return self.__localeAsDoc(alias, aliasFor or name)
+ # No alias child with a source:
+ return elt
+
+ if aliasFor:
+ raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias'
+ .format(aliasFor, name))
+
+ def __scanLocaleRoots(self, name):
+ # Generator for the chain of documents from which locale name
+ # takes its data: yields a Node for name's own document, if it
+ # has one, then likewise for each successive ancestor. Stops on
+ # reaching 'root' or an empty name, neither of which is included.
+ while name and name != 'root':
+ doc = self.__localeAsDoc(name)
+ if doc is not None:
+ yield Node(doc)
+
+ # Prefer an explicitly declared parent; failing that, the
+ # parent is what remains after dropping the last tag.
+ try:
+ name = self.__parentLocale(name)
+ except KeyError:
+ try:
+ name, tail = name.rsplit('_', 1)
+ except ValueError: # No tail to discard: we're done
+ break
+
+    class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
+    def __localeRoots(self, name, cache = CacheDict()):
+        # Returns the (cached) sequence of documents for locale name.
+        # The default cache only holds weak references, so an entry
+        # lasts only as long as something else is using its value.
+        chain = cache.get(name)
+        if chain is None:
+            chain = self.__Seq(self.__scanLocaleRoots(name))
+            cache[name] = chain
+        return chain
+
# Unpolute the namespace: we don't need to export these.
del minidom, CacheDict, os
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index 41795ff634..b28dcecc45 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -2,7 +2,7 @@
# coding=utf8
#############################################################################
##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
@@ -31,15 +31,17 @@
The CLDR data can be downloaded from CLDR_, which has a sub-directory
for each version; you need the ``core.zip`` file for your version of
-choice (typically the latest). This script has had updates to cope up
-to v35; for later versions, we may need adaptations. Unpack the
+choice (typically the latest). This script has had updates to cope up
+to v35; for later versions, we may need adaptations. Unpack the
downloaded ``core.zip`` and check it has a common/main/ sub-directory:
-pass the path of that sub-directory to this script as its single
-command-line argument. Save its standard output (but not error) to a
-file for later processing by ``./qlocalexml2cpp.py``
+pass the path of the root of the unpacked download to this script as its first
+command-line argument. Pass the name of the file in which to write
+output as the second argument; either omit it or use '-' to select the
+standard output. This file is the input needed by
+``./qlocalexml2cpp.py``
When you update the CLDR data, be sure to also update
-src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
+src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
this script's output for unknown language, country or script messages;
if any can be resolved, use their entry in common/main/en.xml to
append new entries to enumdata.py's lists and update documentation in
@@ -53,610 +55,62 @@ time zone names; see cldr2qtimezone.py for details.
"""
import os
-import sys
-import re
-import textwrap
-import enumdata
from localetools import Error
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile, codeMapsFromFile, \
- _findEntryInFile as findEntryInFile
-from dateconverter import convert_date
-from qlocalexml import Locale, QLocaleXmlWriter
-
-# TODO: make calendars a command-line option
-calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
-def wrappedwarn(err, prefix, tokens):
- return err.write(
- '\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
- subsequent_indent=' ', width=80)) + '\n')
-
-def parse_number_format(patterns, data):
- # this is a very limited parsing of the number format for currency only.
- def skip_repeating_pattern(x):
- p = x.replace('0', '#').replace(',', '').replace('.', '')
- seen = False
- result = ''
- for c in p:
- if c == '#':
- if seen:
- continue
- seen = True
- else:
- seen = False
- result = result + c
- return result
- patterns = patterns.split(';')
- result = []
- for pattern in patterns:
- pattern = skip_repeating_pattern(pattern)
- pattern = pattern.replace('#', "%1")
- # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
- # there can be doubled or trippled currency sign, however none of the
- # locales use that.
- pattern = pattern.replace(u'\xa4', "%2")
- pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'")
- pattern = pattern.replace('-', data['minus'])
- pattern = pattern.replace('+', data['plus'])
- result.append(pattern)
- return result
-
-cldr_dir = None
-def raiseUnknownCode(code, form, cache={}):
- """Check whether an unknown code could be supported.
-
- We declare a language, script or country code unknown if it's not
- known to enumdata.py; however, if it's present in main/en.xml's
- mapping of codes to names, we have the option of adding support.
- This caches the necessary look-up (so we only read main/en.xml
- once) and returns the name we should use if we do add support.
-
- First parameter, code, is the unknown code. Second parameter,
- form, is one of 'language', 'script' or 'country' to select the
- type of code to look up. Do not pass further parameters (the next
- will deprive you of the cache).
-
- Raises localetools.Error with a suitable message, that includes
- the unknown code's full name if found.
-
- Relies on global cldr_dir being set before it's called; see tail
- of this file.
- """
- if not cache:
- cache.update(codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
- name = cache[form].get(code)
- msg = 'unknown %s code "%s"' % (form, code)
- if name:
- msg += ' - could use "%s"' % name
- raise Error(msg)
-
-def parse_list_pattern_part_format(pattern):
- # This is a very limited parsing of the format for list pattern part only.
- return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
-
-def unit_quantifiers(find, path, stem, suffix, known,
- # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
- # 1000^7 < zebi = 2^{70}, the next quantifiers up:
- si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
- """Work out the unit quantifiers.
-
- Unfortunately, the CLDR data only go up to terabytes and we want
- all the way to exabytes; but we can recognize the SI quantifiers
- as prefixes, strip and identify the tail as the localized
- translation for 'B' (e.g. French has 'octet' for 'byte' and uses
- ko, Mo, Go, To from which we can extrapolate Po, Eo).
-
- Should be called first for the SI quantifiers, with suffix = 'B',
- then for the IEC ones, with suffix = 'iB'; the list known
- (initially empty before first call) is used to let the second call
- know what the first learned about the localized unit.
- """
- if suffix == 'B': # first call, known = []
- tail = suffix
- for q in si_quantifiers:
- it = find(path, stem % q)
- # kB for kilobyte, in contrast with KiB for IEC:
- q = q[0] if q == 'kilo' else q[0].upper()
- if not it:
- it = q + tail
- elif it.startswith(q):
- rest = it[1:]
- tail = rest if all(rest == k for k in known) else suffix
- known.append(rest)
- yield it
- else: # second call, re-using first's known
- assert suffix == 'iB'
- if known:
- byte = known.pop()
- if all(byte == k for k in known):
- suffix = 'i' + byte
- for q in si_quantifiers:
- yield find(path, stem % q[:2],
- # Those don't (yet, v31) exist in CLDR, so we always fall back to:
- q[0].upper() + suffix)
-
-def generateLocaleInfo(path):
- if not path.endswith(".xml"):
- return {}
-
- # skip legacy/compatibility ones
- alias = findAlias(path)
- if alias:
- raise Error('Alias to "{}"'.format(alias))
-
- def code(tag):
- return findEntryInFile(path, 'identity/' + tag, attribute="type")[0]
-
- return _generateLocaleInfo(path, code('language'), code('script'),
- code('territory'), code('variant'))
-
-def getNumberSystems(cache={}):
- """Cached look-up of number system information.
-
- Pass no arguments. Returns a mapping from number system names to,
- for each system, a mapping with keys 'digits', 'type' and 'id'.
- Relies on global cldr_dir being set before it's first called.\n"""
- if not cache:
- for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'numberingSystems.xml'),
- 'numberingSystems'):
- # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]]
- entry = dict(ns[1])
- name = entry[u'id']
- if u'digits' in entry and ord(entry[u'digits'][0]) > 0xffff:
- # FIXME, QTBUG-69324: make this redundant:
- # omit number system if zero doesn't fit in single-char16 UTF-16 :-(
- sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X]\n'
- % (name, ord(entry[u'digits'][0])))
- else:
- cache[name] = entry
- return cache
-
-def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
- if not path.endswith(".xml"):
- return {}
-
- if language_code == 'root':
- # just skip it
- return {}
-
- # we do not support variants
- # ### actually there is only one locale with variant: en_US_POSIX
- # does anybody care about it at all?
- if variant_code:
- raise Error('We do not support variants ("{}")'.format(variant_code))
-
- language_id = enumdata.languageCodeToId(language_code)
- if language_id <= 0:
- raiseUnknownCode(language_code, 'language')
-
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raiseUnknownCode(script_code, 'script')
-
- # we should handle fully qualified names with the territory
- if not country_code:
- return {}
- country_id = enumdata.countryCodeToId(country_code)
- if country_id <= 0:
- raiseUnknownCode(country_code, 'country')
-
- # So we say we accept only those values that have "contributed" or
- # "approved" resolution. see http://www.unicode.org/cldr/process.html
- # But we only respect the resolution for new datas for backward
- # compatibility.
- draft = DraftResolution.contributed
-
- result = dict(
- language=enumdata.language_list[language_id][0],
- language_code=language_code, language_id=language_id,
- script=enumdata.script_list[script_id][0],
- script_code=script_code, script_id=script_id,
- country=enumdata.country_list[country_id][0],
- country_code=country_code, country_id=country_id,
- variant_code=variant_code)
-
- (dir_name, file_name) = os.path.split(path)
- def from_supplement(tag,
- path=os.path.join(dir_name, '..', 'supplemental',
- 'supplementalData.xml')):
- return findTagsInFile(path, tag)
- currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
- result['currencyIsoCode'] = ''
- result['currencyDigits'] = 2
- result['currencyRounding'] = 1
- if currencies:
- for e in currencies:
- if e[0] == 'currency':
- t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
- if t and t[0]:
- pass
- elif not any(x[0] == 'to' for x in e[1]):
- result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next()
- break
- if result['currencyIsoCode']:
- t = from_supplement("currencyData/fractions/info[iso4217=%s]"
- % result['currencyIsoCode'])
- if t and t[0][0] == 'info':
- result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next()
- result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next()
- numbering_system = None
- try:
- numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
- except Error:
- pass
- def findEntryDef(path, xpath, value=''):
- try:
- return findEntry(path, xpath)
- except Error:
- return value
- def get_number_in_system(path, xpath, numbering_system):
- if numbering_system:
- try:
- return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
- except Error:
- # in CLDR 1.9 number system was refactored for numbers (but not for currency)
- # so if previous findEntry doesn't work we should try this:
- try:
- return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
- except Error:
- # fallback to default
- pass
- return findEntry(path, xpath)
-
- result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
- result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
- assert result['decimal'] != result['group']
- result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
- result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
- try:
- result['zero'] = getNumberSystems()[numbering_system][u"digits"][0]
- except Exception as e:
- sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
- result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
- result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
- result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
- result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system).lower()
- result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
- result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
- result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
- result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
- result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
- result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
- result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
- result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
- result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
- result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
- result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
- result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
- result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
- result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))
-
- endonym = None
- if country_code and script_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
- if not endonym and script_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
- if not endonym and country_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
- if not endonym:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
- result['languageEndonym'] = endonym
- result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))
-
- currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
- currency_format = parse_number_format(currency_format, result)
- result['currencyFormat'] = currency_format[0]
- result['currencyNegativeFormat'] = ''
- if len(currency_format) > 1:
- result['currencyNegativeFormat'] = currency_format[1]
-
- result['currencySymbol'] = ''
- result['currencyDisplayName'] = ''
- if result['currencyIsoCode']:
- result['currencySymbol'] = findEntryDef(path, "numbers/currencies/currency[%s]/symbol" % result['currencyIsoCode'])
- result['currencyDisplayName'] = ';'.join(
- findEntryDef(path, 'numbers/currencies/currency[' + result['currencyIsoCode']
- + ']/displayName' + tail)
- for tail in ['',] + [
- '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
- ]) + ';'
-
- def findUnitDef(path, stem, fallback=''):
- # The displayName for a quantified unit in en.xml is kByte
- # instead of kB (etc.), so prefer any unitPattern provided:
- for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
- try:
- ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
- except Error:
- continue
-
- # TODO: epxloit count-handling, instead of discarding placeholders
- if ans.startswith('{0}'):
- ans = ans[3:].lstrip()
- if ans:
- return ans
-
- return findEntryDef(path, stem + 'displayName', fallback)
-
- # First without quantifier, then quantified each way:
- result['byte_unit'] = findEntryDef(
- path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
- 'bytes')
- stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
- known = [] # cases where we *do* have a given version:
- result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
- # IEC 60027-2
- # http://physics.nist.gov/cuu/Units/binary.html
- result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
-
- # Used for month and day data:
- namings = (
- ('standaloneLong', 'stand-alone', 'wide'),
- ('standaloneShort', 'stand-alone', 'abbreviated'),
- ('standaloneNarrow', 'stand-alone', 'narrow'),
- ('long', 'format', 'wide'),
- ('short', 'format', 'abbreviated'),
- ('narrow', 'format', 'narrow'),
- )
-
- # Month names for 12-month calendars:
- for cal in calendars:
- stem = 'dates/calendars/calendar[' + cal + ']/months/'
- for (key, mode, size) in namings:
- prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
- result[key + 'Months_' + cal] = ';'.join(
- findEntry(path, stem + prop + "month[%d]" % i)
- for i in range(1, 13)) + ';'
-
- # Day data (for Gregorian, at least):
- stem = 'dates/calendars/calendar[gregorian]/days/'
- days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
- for (key, mode, size) in namings:
- prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
- result[key + 'Days'] = ';'.join(
- findEntry(path, stem + prop + '[' + day + ']')
- for day in days) + ';'
-
- return Locale(result)
-
-def integrateWeekData(filePath, locale_database):
- if not filePath.endswith(".xml"):
- return {}
-
- def lookup(key):
- return findEntryInFile(filePath, key, attribute='territories')[0].split()
- days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
-
- firstDayByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/firstDay[day=%s]' % day):
- firstDayByCountryCode[countryCode] = day
-
- weekendStartByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/weekendStart[day=%s]' % day):
- weekendStartByCountryCode[countryCode] = day
-
- weekendEndByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/weekendEnd[day=%s]' % day):
- weekendEndByCountryCode[countryCode] = day
-
- for (key, locale) in locale_database.iteritems():
- countryCode = locale.country_code
- if countryCode in firstDayByCountryCode:
- locale.firstDayOfWeek = firstDayByCountryCode[countryCode]
- else:
- locale.firstDayOfWeek = firstDayByCountryCode["001"]
-
- if countryCode in weekendStartByCountryCode:
- locale.weekendStart = weekendStartByCountryCode[countryCode]
- else:
- locale.weekendStart = weekendStartByCountryCode["001"]
-
- if countryCode in weekendEndByCountryCode:
- locale.weekendEnd = weekendEndByCountryCode[countryCode]
- else:
- locale.weekendEnd = weekendEndByCountryCode["001"]
-
-def splitLocale(name):
- """Split name into (language, script, territory) triple as generator.
-
- Ignores any trailing fields (with a warning), leaves script (a capitalised
- four-letter token) or territory (either a number or an all-uppercase token)
- empty if unspecified, returns a single-entry generator if name is a single
- tag (i.e. contains no underscores). Always yields 1 or 3 values, never 2."""
- tags = iter(name.split('_'))
- yield tags.next() # Language
- tag = tags.next()
-
- # Script is always four letters, always capitalised:
- if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
- yield tag
- try:
- tag = tags.next()
- except StopIteration:
- tag = ''
- else:
- yield ''
-
- # Territory is upper-case or numeric:
- if tag and tag.isupper() or tag.isdigit():
- yield tag
- tag = ''
- else:
- yield ''
-
- # If nothing is left, StopIteration will avoid the warning:
- tag = (tag if tag else tags.next(),)
- sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
-
-def _parseLocale(l):
- language = "AnyLanguage"
- script = "AnyScript"
- country = "AnyCountry"
-
- if l == "und":
- raise Error('We treat unknown locale like C')
-
- parsed = splitLocale(l)
- language_code = parsed.next()
- script_code = country_code = ''
- try:
- script_code, country_code = parsed
- except ValueError:
- pass
-
- if language_code != "und":
- language_id = enumdata.languageCodeToId(language_code)
- if language_id == -1:
- raise Error('Unknown language code "{}"'.format(language_code))
- language = enumdata.language_list[language_id][0]
-
- if script_code:
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raise Error('Unknown script code "{}"'.format(script_code))
- script = enumdata.script_list[script_id][0]
-
- if country_code:
- country_id = enumdata.countryCodeToId(country_code)
- if country_id == -1:
- raise Error('Unknown country code "{}"'.format(country_code))
- country = enumdata.country_list[country_id][0]
-
- return (language, script, country)
-
-def likelySubtags(root, err):
- skips = []
- for ns in findTagsInFile(os.path.join(root, 'supplemental', 'likelySubtags.xml'), "likelySubtags"):
- tmp = {}
- for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
- tmp[data[0]] = data[1]
-
- try:
- from_language, from_script, from_country = _parseLocale(tmp[u"from"])
- to_language, to_script, to_country = _parseLocale(tmp[u"to"])
- except Error as e:
- if (tmp['to'].startswith(tmp['from'])
- and e.message == 'Unknown language code "{}"'.format(tmp['from'])):
- skips.append(tmp['to'])
- else:
- sys.stderr.write('skipping likelySubtag "{}" -> "{}" ({})\n'.format(
- tmp[u"from"], tmp[u"to"], e.message))
- continue
- # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
- if to_country == "AnyCountry" and from_country != to_country:
- to_country = from_country
- if to_script == "AnyScript" and from_script != to_script:
- to_script = from_script
-
- yield ((from_language, from_script, from_country),
- (to_language, to_script, to_country))
- if skips:
- wrappedwarn(err, 'skipping likelySubtags (for unknown language codes): ', skips)
+from cldr import CldrReader
+from qlocalexml import QLocaleXmlWriter
+from enumdata import language_list, script_list, country_list
def usage(err, name, message = ''):
+ """Write a usage summary for this script to err.
+
+ First parameter, err, is a file-like object with a write() method;
+ second, name, is the name this script was invoked as. Optional
+ third parameter, message, is appended after a blank line if given."""
- err.write("""Usage: {} <path-to-cldr-main> [out-file.xml]
-""".format(name)) # TODO: expand
+ # main() expects the root of the CLDR tree (the directory that
+ # contains common/main/root.xml), not its common/main/ sub-directory:
+ err.write("""Usage: {} path/to/cldr [out-file.xml]
+""".format(name)) # TODO: expand command-line, improve help message
if message:
err.write('\n' + message + '\n')
def main(args, out, err):
+ """Convert CLDR data to QLocaleXML.
+
+ First parameter, args, is the command line: the script's name, the
+ root of the CLDR tree and an optional output file name (omitted or
+ '-' means write to out). Second, out, is the default output stream;
+ third, err, receives usage and diagnostic messages. Returns 0 on
+ success, 1 on a command-line error."""
- name = args.pop(0)
+ # TODO: make calendars a command-line option
+ calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
- if len(args) < 1:
- usage(err, name)
+ # TODO: make argument parsing more sophisticated
+ name = args.pop(0)
+ if not args:
+ usage(err, name, 'Where is your CLDR data tree ?')
return 1
- global cldr_dir
- cldr_dir = args.pop(0)
- if not os.path.isdir(cldr_dir):
- usage(err, name, 'Where did you unpack the CLDR data files ?')
+ root = args.pop(0)
+ if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')):
+ usage(err, name,
+ 'First argument is the root of the CLDR tree: found no common/main/root.xml under '
+ + root)
return 1
- if len(args) > 1:
- usage(err, name, 'Too many arguments passed')
+ xml = args.pop(0) if args else None
+ if not xml or xml == '-':
+ emit = out
+ elif not xml.endswith('.xml'):
+ usage(err, name, 'Please use a .xml extension on your output file name, not ' + xml)
return 1
- if args:
- qxml = open(args.pop(0), 'w')
else:
- qxml = out
-
- getNumberSystems(cldr_dir)
- cldr_files = os.listdir(cldr_dir)
- locale_database = {}
-
- # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
- defaultContent_locales = []
- for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'supplementalMetadata.xml'),
- 'metadata/defaultContent'):
- for data in ns[1:][0]:
- if data[0] == u"locales":
- defaultContent_locales += data[1].split()
-
- skips = []
- for file in defaultContent_locales:
try:
- language_code, script_code, country_code = splitLocale(file)
- except ValueError:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
- continue
+ emit = open(xml, 'w')
+ except IOError as e:
+ usage(err, name, 'Failed to open "{}" to write output to it\n'.format(xml))
+ return 1
- if not (script_code or country_code):
- sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
- continue
-
- try:
- l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
- if not l:
- skips.append(file)
- continue
- except Error as e:
- sys.stderr.write('skipping defaultContent locale "{}" ({})\n'.format(file, e.message))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
- if skips:
- wrappedwarn(err, 'skipping defaultContent locales [no locale info generated]: ', skips)
- skips = []
-
- for file in cldr_files:
- try:
- l = generateLocaleInfo(cldr_dir + "/" + file)
- if not l:
- skips.append(file)
- continue
- except Error as e:
- sys.stderr.write('skipping file "{}" ({})\n'.format(file, e.message))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
- if skips:
- wrappedwarn(err, 'skipping files [no locale info generated]: ', skips)
+ if args:
+ usage(err, name, 'Too many arguments - excess: ' + ' '.join(args))
+ return 1
- integrateWeekData(cldr_dir + "/../supplemental/supplementalData.xml", locale_database)
- cldr_version = 'unknown'
- with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml:
- for line in ldml:
- if 'version cldrVersion CDATA #FIXED' in line:
- cldr_version = line.split('"')[1]
+ # TODO - command line options to tune choice of grumble and whitter:
+ reader = CldrReader(root, err.write, err.write)
+ writer = QLocaleXmlWriter(emit.write)
- xmlOut = QLocaleXmlWriter(qxml.write)
- xmlOut.version(cldr_version)
- xmlOut.enumData(enumdata.language_list,
- enumdata.script_list,
- enumdata.country_list)
- xmlOut.likelySubTags(likelySubtags(os.path.split(cldr_dir)[0], err))
- xmlOut.locales(locale_database, calendars)
- xmlOut.close()
- if qxml is not out:
- qxml.close()
+ writer.version(reader.root.cldrVersion)
+ writer.enumData(language_list, script_list, country_list)
+ writer.likelySubTags(reader.likelySubTags())
+ writer.locales(reader.readLocales(calendars), calendars)
+ writer.close()
+ # The writer only holds emit.write, so close any file we opened ourselves:
+ if emit is not out:
+ emit.close()
return 0
if __name__ == '__main__':
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index 4aaa728a86..ff94f3da73 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -39,10 +39,12 @@ returned by minidom.parse() and their child-nodes:
Node -- wraps any node in the DOM tree
XmlScanner -- wraps the root element of a stand-alone XML file
Supplement -- specializes XmlScanner for supplemental data files
+ LocaleScanner -- wraps a locale's inheritance-chain of file roots
See individual classes for further detail.
"""
from localetools import Error
+from dateconverter import convert_date
class Node (object):
"""Wrapper for an arbitrary DOM node.
@@ -51,11 +53,20 @@ class Node (object):
nodes are returned wrapped as Node objects. A Node exposes the
raw DOM node it wraps via its .dom attribute."""
- def __init__(self, elt):
+ def __init__(self, elt, draft = 0):
"""Wraps a DOM node for ease of access.
- Single argument, elt, is the DOM node to wrap."""
+ First argument, elt, is the DOM node to wrap. (Optional second
+ argument, draft, should only be supplied by this class's
+ creation of child nodes; it is the maximum draft score of any
+ ancestor of the new node.)"""
self.dom = elt
+ try:
+ attr = elt.attributes['draft'].nodeValue
+ except KeyError:
+ self.draft = draft
+ else:
+ self.draft = max(draft, self.draftScore(attr))
def findAllChildren(self, tag, wanted = None):
"""All children that do have the given tag and attributes.
@@ -65,34 +76,60 @@ class Node (object):
Optional second argument, wanted, should either be None or map
attribute names to the values they must have. Only child nodes
- with these attributes set to the given values are yielded."""
+ with these attributes set to the given values are yielded."""
- cutoff = 4 # Only accept approved, for now
for child in self.dom.childNodes:
if child.nodeType != child.ELEMENT_NODE:
continue
if child.nodeName != tag:
continue
- try:
- draft = child.attributes['draft']
- except KeyError:
- pass
- else:
- if self.__draftScores.get(draft, 0) < cutoff:
- continue
-
- if wanted is not None:
+ if wanted:
try:
- if wanted and any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
+ if any(child.attributes[k].nodeValue != v
+ for k, v in wanted.items()):
continue
except KeyError: # Some wanted attribute is missing
continue
- yield Node(child)
+ yield Node(child, self.draft)
+
+ def findUniqueChild(self, tag):
+ """Returns the single child with the given nodeName.
+
+ Raises Error if there is no such child or there is more than
+ one."""
+ seq = self.findAllChildren(tag)
+ try:
+ node = seq.next()
+ except StopIteration:
+ raise Error('No child found where one was expected', tag)
+ for it in seq:
+ raise Error('Many children found where only one was expected', tag)
+ return node
+
+ @classmethod
+ def draftScore(cls, level):
+ """Maps draft level names to numeric scores.
+
+ Single parameter, level, is the least sure value of the draft
+ attribute on a node that you're willing to accept; returns a
+ numeric value (lower is less drafty).
- __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
- contributed = 3, approved = 4, false = 4)
+ Tempting as it is to insist on low draft scores, there are
+ many locales in which pretty much every leaf is
+ unconfirmed. It may make sense to actually check each
+ XmlScanner object, or each node in each LocaleScanner's nodes
+ list, to see what its distribution of draft level looks like,
+ so as to set the acceptable draft score for its elements
+ accordingly. However, for the moment, we mostly just accept
+ all elements, regardless of draft values (the one exception is
+ am/pm indicators)."""
+ return cls.__draftScores.get(level, 5) if level else 0
+
+ # Implementation details:
+ __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2,
+ contributed = 1, approved = 0, false = 0)
def _parseXPath(selector):
# Split "tag[attr=val][...]" into tag-name and attribute mapping
@@ -129,7 +166,6 @@ class XmlScanner (object):
return elts
class Supplement (XmlScanner):
- # Replaces xpathlite.findTagsInFile()
def find(self, xpath):
elts = self.findNodes(xpath)
for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
@@ -138,3 +174,381 @@ class Supplement (XmlScanner):
yield (elt.nodeName,
dict((k, v if isinstance(v, basestring) else v.nodeValue)
for k, v in elt.attributes.items()))
+
+class LocaleScanner (object):
+ def __init__(self, name, nodes, root):
+ self.name, self.nodes, self.base = name, nodes, root
+
+ def find(self, xpath, draft = None):
+ tags = xpath.split('/')
+ while True:
+ replace = None
+ for elt in self.nodes:
+ for selector in tags:
+ tag, attrs = _parseXPath(selector)
+ for elt in elt.findAllChildren(tag, attrs):
+ if draft is None or elt.draft <= draft:
+ break # and process the next selector
+ else:
+ break # no child, try next elt in self.nodes
+ else:
+ # processed all selectors
+ try:
+ return elt.dom.firstChild.nodeValue
+ except (AttributeError, KeyError):
+ pass # move on to next elt in self.nodes
+
+ # No match in self.nodes; check root
+ elt = self.base.root
+ for i, selector in enumerate(tags):
+ tag, attrs = _parseXPath(selector)
+ for alias in elt.findAllChildren('alias'):
+ if alias.dom.attributes['source'].nodeValue == 'locale':
+ replace = alias.dom.attributes['path'].nodeValue.split('/')
+ tags = self.__xpathJoin(tags[:i], replace, tags[i:])
+ break
+ else:
+ for elt in elt.findAllChildren(tag, attrs):
+ if draft is None or elt.draft <= draft:
+ break # and process the next selector
+ else:
+ break
+ if replace:
+ break
+ else:
+ # processed all selectors
+ try:
+ return elt.dom.firstChild.nodeValue
+ except (AttributeError, KeyError):
+ # No match
+ pass
+ if not replace:
+ break
+
+ sought = '/'.join(tags)
+ if sought != xpath:
+ sought += ' (for {})'.format(xpath)
+ raise Error('No {} in {}'.format(sought, self.name))
+
+ def findOr(self, xpath, fallback = ''):
+ """Use a fall-back value if we don't find data.
+
+ Like find, but takes a fall-back value to return instead of
+ raising Error on failure."""
+ try:
+ return self.find(xpath)
+ except Error:
+ return fallback
+
+ def tagCodes(self):
+ """Yields four tag codes
+
+ The tag codes are language, script, country and variant; an
+ empty value for any of them indicates that no value was
+ provided. The values are obtained from the primary file's
+ top-level <identity> element. An Error is raised if any
+ top-level <alias> element of this file has a non-empty source
+ attribute; that attribute value is mentioned in the error's
+ message."""
+ root = self.nodes[0]
+ for alias in root.findAllChildren('alias'):
+ try:
+ source = alias.dom.attributes['source'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ raise Error('Alias to {}'.format(source))
+
+ ids = root.findUniqueChild('identity')
+ for code in ('language', 'script', 'territory', 'variant'):
+ for node in ids.findAllChildren(code):
+ try:
+ yield node.dom.attributes['type'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ break # only want one value for each code
+ else: # No value for this code, use empty
+ yield ''
+
+ def currencyData(self, isoCode):
+ """Fetches currency data for this locale.
+
+ Single argument, isoCode, is the ISO currency code for the
+ currency in use in the country. See also numericData, which
+ includes some currency formats.
+ """
+ if isoCode:
+ stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
+ symbol = self.findOr(stem + 'symbol')
+ name = ';'.join(
+ self.findOr(stem + 'displayName' + tail)
+ for tail in ('',) + tuple(
+ '[count={}]'.format(x) for x in ('zero', 'one', 'two', 'few', 'many', 'other')
+ )) + ';'
+ else:
+ symbol = name = ''
+ yield 'currencySymbol', symbol
+ yield 'currencyDisplayName', name
+
+ def numericData(self, lookup, complain = lambda text: None):
+ """Generate assorted numeric data for the locale.
+
+ First argument, lookup, is a callable that maps a numbering
+ system's name to certain data about the system, as a mapping;
+ we expect this to have u'digits' as a key.
+ """
+ system = self.find('numbers/defaultNumberingSystem')
+ stem = 'numbers/symbols[numberSystem={}]/'.format(system)
+ decimal = self.find(stem + 'decimal')
+ group = self.find(stem + 'group')
+ assert decimal != group, (self.name, system, decimal)
+ yield 'decimal', decimal
+ yield 'group', group
+ yield 'percent', self.find(stem + 'percentSign')
+ yield 'list', self.find(stem + 'list')
+ # FIXME: don't lower-case:
+ yield 'exp', self.find(stem + 'exponential').lower()
+
+ digits = lookup(system)['digits']
+ assert len(digits) == 10
+ zero = digits[0]
+ # Qt's number-formatting code assumes digits are consecutive:
+ assert all(ord(c) == i for i, c in enumerate(digits, ord(zero)))
+ yield 'zero', zero
+
+ plus = self.find(stem + 'plusSign')
+ minus = self.find(stem + 'minusSign')
+ yield 'plus', plus
+ yield 'minus', minus
+
+ # Currency formatting (currencyFormat may have a type field):
+ money = self.find('numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern')
+ money = self.__currencyFormats(money, plus, minus)
+ yield 'currencyFormat', money.next()
+ neg = ''
+ for it in money:
+ assert not neg, 'There should be at most one more pattern'
+ neg = it
+ yield 'currencyNegativeFormat', neg
+
+ def textPatternData(self):
+ for key in ('quotationStart', 'alternateQuotationEnd',
+ 'quotationEnd', 'alternateQuotationStart'):
+ yield key, self.find('delimiters/' + key)
+
+ for key in ('start', 'middle', 'end'):
+ yield ('listPatternPart' + key.capitalize(),
+ self.__fromLdmlListPattern(self.find(
+ 'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+ yield ('listPatternPartTwo',
+ self.__fromLdmlListPattern(self.find(
+ 'listPatterns/listPattern/listPatternPart[2]')))
+
+ stem = 'dates/calendars/calendar[gregorian]/'
+ # TODO: is wide really the right width to use here ?
+ # abbreviated might be an option ... or try both ?
+ meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+ for key in ('am', 'pm'):
+ yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+ draft = Node.draftScore('contributed'))
+
+ for pair in (('long', 'full'), ('short', 'short')):
+ for key in ('time', 'date'):
+ yield (pair[0] + key.capitalize() + 'Format',
+ convert_date(self.find(
+ stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
+ key, key, pair[1], key))))
+
+ def endonyms(self, language, script, country, variant):
+ # TODO: take variant into account ?
+ for seq in ((language, script, country),
+ (language, script), (language, country), (language,)):
+ if not all(seq):
+ continue
+ try:
+ yield ('languageEndonym',
+ self.find('localeDisplayNames/languages/language[{}]'
+ .format('_'.join(seq))))
+ except Error:
+ pass
+ else:
+ break
+ else:
+ # grumble(failed to find endonym for language)
+ yield 'languageEndonym', ''
+
+ yield ('countryEndonym',
+ self.findOr('localeDisplayNames/territories/territory[{}]'
+ .format(country)))
+
+ def unitData(self):
+ yield ('byte_unit',
+ self.findOr('units/unitLength[long]/unit[digital-byte]/displayName',
+ 'bytes'))
+
+ unit = self.__findUnit('', 'B')
+ cache = [] # Populated by the SI call, to give hints to the IEC call
+ yield ('byte_si_quantified',
+ ';'.join(self.__unitCount('', unit, cache)))
+ # IEC 60027-2
+ # http://physics.nist.gov/cuu/Units/binary.html
+ yield ('byte_iec_quantified',
+ ';'.join(self.__unitCount('bi', 'iB', cache)))
+
+ def calendarNames(self, calendars):
+ namings = self.__nameForms
+ for cal in calendars:
+ stem = 'dates/calendars/calendar[' + cal + ']/months/'
+ for key, mode, size in namings:
+ prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
+ yield (key + 'Months_' + cal,
+ ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+ for i in range(1, 13)) + ';')
+
+ # Day data (for Gregorian, at least):
+ stem = 'dates/calendars/calendar[gregorian]/days/'
+ days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
+ for (key, mode, size) in namings:
+ prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
+ yield (key + 'Days',
+ ';'.join(self.find(stem + prop + '[' + day + ']')
+ for day in days) + ';')
+
+ # Implementation details
+ __nameForms = (
+ ('standaloneLong', 'stand-alone', 'wide'),
+ ('standaloneShort', 'stand-alone', 'abbreviated'),
+ ('standaloneNarrow', 'stand-alone', 'narrow'),
+ ('long', 'format', 'wide'),
+ ('short', 'format', 'abbreviated'),
+ ('narrow', 'format', 'narrow'),
+ ) # Used for month and day names
+
+ def __findUnit(self, keySuffix, quantify, fallback=''):
+ # The displayName for a quantified unit in en.xml is kByte
+ # (even for unitLength[narrow]) instead of kB (etc.), so
+ # prefer any unitPattern provided, but prune its placeholder:
+ for size in ('short', 'narrow'): # TODO: reverse order ?
+ stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+ for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+ try:
+ ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+ except Error:
+ continue
+
+ # TODO: do count-handling, instead of discarding placeholders
+ if False: # TODO: do it this way, instead !
+ ans = ans.replace('{0}', '').strip()
+ elif ans.startswith('{0}'):
+ ans = ans[3:].lstrip()
+ if ans:
+ return ans
+
+ try:
+ return self.find(stem + 'displayName')
+ except Error:
+ pass
+
+ return fallback
+
+ def __unitCount(self, keySuffix, suffix, cache,
+ # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+ # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+ siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+ """Work out the unit quantifiers.
+
+ Unfortunately, the CLDR data only go up to terabytes and we
+ want all the way to exabytes; but we can recognize the SI
+ quantifiers as prefixes, strip and identify the tail as the
+ localized translation for 'B' (e.g. French has 'octet' for
+ 'byte' and uses ko, Mo, Go, To from which we can extrapolate
+ Po, Eo).
+
+ Should be called first for the SI quantifiers, with suffix =
+ 'B', then for the IEC ones, with suffix = 'iB'; the list cache
+ (initially empty before first call) is used to let the second
+ call know what the first learned about the localized unit.
+ """
+ if suffix == 'iB': # second call, re-using first's cache
+ if cache:
+ byte = cache.pop()
+ if all(byte == k for k in cache):
+ suffix = 'i' + byte
+ for q in siQuantifiers:
+ # Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
+ yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+ else: # first call
+ tail = suffix = suffix or 'B'
+ for q in siQuantifiers:
+ it = self.__findUnit(keySuffix, q)
+ # kB for kilobyte, in contrast with KiB for IEC:
+ q = q[0] if q == 'kilo' else q[0].upper()
+ if not it:
+ it = q + tail
+ elif it.startswith(q):
+ rest = it[1:]
+ tail = rest if all(rest == k for k in cache) else suffix
+ cache.append(rest)
+ yield it
+
+ @staticmethod
+ def __currencyFormats(patterns, plus, minus):
+ for p in patterns.split(';'):
+ p = p.replace('0', '#').replace(',', '').replace('.', '')
+ try:
+ cut = p.find('#') + 1
+ except ValueError:
+ pass
+ else:
+ p = p[:cut] + p[cut:].replace('#', '')
+ p = p.replace('#', "%1")
+ # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
+ # there can be a doubled or tripled currency sign; however, none of the
+ # locales use that.
+ p = p.replace(u'\xa4', "%2")
+ # Single quote goes away, but double goes to single:
+ p = p.replace("''", '###').replace("'", '').replace('###', "'")
+ # Use number system's signs:
+ p = p.replace('+', plus).replace('-', minus)
+ yield p
+
+ @staticmethod
+ def __fromLdmlListPattern(pattern):
+ # This is a very limited parsing of the format for list pattern part only.
+ return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3')
+
+ @staticmethod
+ def __fromLdmlPath(seq): # tool function for __xpathJoin()
+ """Convert LDML's [@name='value'] to our [name=value] form."""
+ for it in seq:
+ # First dismember it:
+ attrs = it.split('[')
+ tag = attrs.pop(0)
+ if not attrs: # Short-cut the easy case:
+ yield it
+ continue
+
+ assert all(x.endswith(']') for x in attrs)
+ attrs = [x[:-1].split('=') for x in attrs]
+ # Then fix each attribute specification in it:
+ attrs = [(x[0][1:] if x[0].startswith('@') else x[0],
+ x[1][1:-1] if x[1].startswith("'") and x[1].endswith("'") else x[1])
+ for x in attrs]
+ # Finally, put it all back together:
+ attrs = ['='.join(x) + ']' for x in attrs]
+ attrs.insert(0, tag)
+ yield '['.join(attrs)
+
+ @classmethod
+ def __xpathJoin(cls, head, insert, tail):
+ """Join three lists of XPath selectors.
+
+ Each of head, insert and tail is a sequence of selectors but
+ insert may start with some uses of '..', that we want to
+ resolve away, and may use LDML's attribute format, that we
+ want to convert to our format."""
+ while insert and insert[0] == '..':
+ insert.pop(0)
+ head.pop()
+ return head + list(cls.__fromLdmlPath(insert)) + tail
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index 59161ed9d0..1938be19ea 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -480,7 +480,7 @@ def main(args, out, err):
return 1
reader = QLocaleXmlReader(qlocalexml)
- locale_map = dict(reader.loadLocaleMap(calendars, sys.stderr.write))
+ locale_map = dict(reader.loadLocaleMap(calendars, err.write))
locale_keys = locale_map.keys()
compareLocaleKeys.default_map = dict(reader.defaultMap())
diff --git a/util/locale_database/xpathlite.py b/util/locale_database/xpathlite.py
deleted file mode 100644
index 3da8b24656..0000000000
--- a/util/locale_database/xpathlite.py
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/env python
-#############################################################################
-##
-## Copyright (C) 2016 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
-
-import sys
-import os
-import xml.dom.minidom
-
-from localetools import Error
-
-class DraftResolution:
- # See http://www.unicode.org/cldr/process.html for description
- unconfirmed = 'unconfirmed'
- provisional = 'provisional'
- contributed = 'contributed'
- approved = 'approved'
- _values = { unconfirmed : 1, provisional : 2, contributed : 3, approved : 4 }
- def __init__(self, resolution):
- self.resolution = resolution
- def toInt(self):
- return DraftResolution._values[self.resolution]
-
-doc_cache = {}
-def parseDoc(file):
- if not doc_cache.has_key(file):
- doc_cache[file] = xml.dom.minidom.parse(file)
- return doc_cache[file]
-
-def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
- for node in parent.childNodes:
- if node.nodeType != node.ELEMENT_NODE:
- continue
- if node.nodeName != tag_name:
- continue
- if arg_value:
- if not node.attributes.has_key(arg_name):
- continue
- if node.attributes[arg_name].nodeValue != arg_value:
- continue
- if draft:
- if not node.attributes.has_key('draft'):
- # if draft is not specified then it's approved
- return node
- value = node.attributes['draft'].nodeValue
- value = DraftResolution(value).toInt()
- exemplar = DraftResolution(draft).toInt()
- if exemplar > value:
- continue
- return node
- return False
-
-def codeMapsFromFile(file):
- """Extract mappings of language, script and country codes to names.
-
- The file shall typically be common/main/en.xml, which contains a
- localeDisplayNames element with children languages, scripts and
- territories; each element in each of these has a code as its type
- attribute and its name as element content. This returns a mapping
- withe keys 'language', 'script' and 'country', each of which
- has, as value, a mapping of the relevant codes to names.
- """
- parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
- keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
- for src, dst in keys.items():
- child = findChild(parent, src)
- data = result[dst] = {}
- for elt in child.childNodes:
- if elt.attributes and elt.attributes.has_key('type'):
- key, value = elt.attributes['type'].value, elt.childNodes[0].wholeText
- # Don't over-write previously-read data for an alt form:
- if elt.attributes.has_key('alt') and data.has_key(key):
- continue
- data[key] = value
-
- return result
-
-def findTagsInFile(file, path):
- doc = parseDoc(file)
-
- elt = doc.documentElement
- tag_spec_list = path.split("/")
- last_entry = None
- for tag_spec in tag_spec_list:
- tag_name = tag_spec
- arg_name = 'type'
- arg_value = ''
- left_bracket = tag_spec.find('[')
- if left_bracket != -1:
- tag_name = tag_spec[:left_bracket]
- arg_value = tag_spec[left_bracket+1:-1].split("=")
- if len(arg_value) == 2:
- arg_name = arg_value[0]
- arg_value = arg_value[1]
- else:
- arg_value = arg_value[0]
- elt = findChild(elt, tag_name, arg_name, arg_value)
- if not elt:
- return None
- ret = []
- if elt.childNodes:
- for node in elt.childNodes:
- if node.attributes:
- element = [node.nodeName, None]
- element[1] = node.attributes.items()
- ret.append(element)
- else:
- if elt.attributes:
- element = [elt.nodeName, None]
- element[1] = elt.attributes.items()
- ret.append(element)
- return ret
-
-def _findEntryInFile(file, path, draft=None, attribute=None):
- doc = parseDoc(file)
-
- elt = doc.documentElement
- tag_spec_list = path.split("/")
- last_entry = None
- for i in range(len(tag_spec_list)):
- tag_spec = tag_spec_list[i]
- tag_name = tag_spec
- arg_name = 'type'
- arg_value = ''
- left_bracket = tag_spec.find('[')
- if left_bracket != -1:
- tag_name = tag_spec[:left_bracket]
- arg_value = tag_spec[left_bracket+1:-1].split("=")
- if len(arg_value) == 2:
- arg_name = arg_value[0].replace("@", "").replace("'", "")
- arg_value = arg_value[1]
- else:
- arg_value = arg_value[0]
- alias = findChild(elt, 'alias')
- if alias and alias.attributes['source'].nodeValue == 'locale':
- path = alias.attributes['path'].nodeValue
- aliaspath = tag_spec_list[:i] + path.split("/")
- def resolve(x, y):
- if y == '..':
- return x[:-1]
- return x + [y]
- # resolve all dot-dot parts of the path
- aliaspath = reduce(resolve, aliaspath, [])
- # remove attribute specification that our xpathlite doesnt support
- aliaspath = map(lambda x: x.replace("@type=", "").replace("'", ""), aliaspath)
- # append the remaining path
- aliaspath = aliaspath + tag_spec_list[i:]
- aliaspath = "/".join(aliaspath)
- # "locale" aliases are special - we need to start lookup from scratch
- return (None, aliaspath)
- elt = findChild(elt, tag_name, arg_name, arg_value, draft)
- if not elt:
- return ("", None)
- if attribute is not None:
- if elt.attributes.has_key(attribute):
- return (elt.attributes[attribute].nodeValue, None)
- return (None, None)
- try:
- return (elt.firstChild.nodeValue, None)
- except:
- pass
- return (None, None)
-
-def findAlias(file):
- doc = parseDoc(file)
-
- alias_elt = findChild(doc.documentElement, "alias")
- if not alias_elt:
- return False
- if not alias_elt.attributes.has_key('source'):
- return False
- return alias_elt.attributes['source'].nodeValue
-
-lookup_chain_cache = {}
-parent_locales = {}
-def _fixedLookupChain(dirname, name):
- if lookup_chain_cache.has_key(name):
- return lookup_chain_cache[name]
-
- # see http://www.unicode.org/reports/tr35/#Parent_Locales
- if not parent_locales:
- for ns in findTagsInFile(dirname + "/../supplemental/supplementalData.xml", "parentLocales"):
- tmp = {}
- parent_locale = ""
- for data in ns[1:][0]: # ns looks like this: [u'parentLocale', [(u'parent', u'root'), (u'locales', u'az_Cyrl bs_Cyrl en_Dsrt ..')]]
- tmp[data[0]] = data[1]
- if data[0] == u"parent":
- parent_locale = data[1]
- parent_locales[parent_locale] = tmp[u"locales"].split(" ")
-
- items = name.split("_")
- # split locale name into items and iterate through them from back to front
- # example: az_Latn_AZ => [az_Latn_AZ, az_Latn, az]
- items = list(reversed(map(lambda x: "_".join(items[:x+1]), range(len(items)))))
-
- for i in range(len(items)):
- item = items[i]
- for parent_locale in parent_locales.keys():
- for locale in parent_locales[parent_locale]:
- if item == locale:
- if parent_locale == u"root":
- items = items[:i+1]
- else:
- items = items[:i+1] + _fixedLookupChain(dirname, parent_locale)
- lookup_chain_cache[name] = items
- return items
-
- lookup_chain_cache[name] = items
- return items
-
-def _findEntry(base, path, draft=None, attribute=None):
- if base.endswith(".xml"):
- base = base[:-4]
- (dirname, filename) = os.path.split(base)
-
- items = _fixedLookupChain(dirname, filename)
- for item in items:
- file = dirname + "/" + item + ".xml"
- if os.path.isfile(file):
- alias = findAlias(file)
- if alias:
- # if alias is found we should follow it and stop processing current file
- # see http://www.unicode.org/reports/tr35/#Common_Elements
- aliasfile = os.path.dirname(file) + "/" + alias + ".xml"
- if not os.path.isfile(aliasfile):
- raise Error("findEntry: fatal error: found an alias '%s' to '%s', but the alias file couldn't be found" % (filename, alias))
- # found an alias, recurse into parsing it
- result = _findEntry(aliasfile, path, draft, attribute)
- return result
- (result, aliaspath) = _findEntryInFile(file, path, draft, attribute)
- if aliaspath:
- # start lookup again because of the alias source="locale"
- return _findEntry(base, aliaspath, draft, attribute)
- if result:
- return result
- return None
-
-def findEntry(base, path, draft=None, attribute=None):
- file = base
- if base.endswith(".xml"):
- file = base
- base = base[:-4]
- else:
- file = base + ".xml"
- (dirname, filename) = os.path.split(base)
-
- result = None
- while path:
- result = _findEntry(base, path, draft, attribute)
- if result:
- return result
- (result, aliaspath) = _findEntryInFile(dirname + "/root.xml", path, draft, attribute)
- if result:
- return result
- if not aliaspath:
- raise Error("findEntry: fatal error: %s: cannot find key %s" % (filename, path))
- path = aliaspath
-
- return result
-