summary refs log tree commit diff stats
path: root/util/locale_database/ldml.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/ldml.py')
-rw-r--r-- util/locale_database/ldml.py 450
1 files changed, 432 insertions, 18 deletions
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index 4aaa728a86..ff94f3da73 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -39,10 +39,12 @@ returned by minidom.parse() and their child-nodes:
Node -- wraps any node in the DOM tree
XmlScanner -- wraps the root element of a stand-alone XML file
Supplement -- specializes XmlScanner for supplemental data files
+ LocaleScanner -- wraps a locale's inheritance-chain of file roots
See individual classes for further detail.
"""
from localetools import Error
+from dateconverter import convert_date
class Node (object):
"""Wrapper for an arbitrary DOM node.
@@ -51,11 +53,20 @@ class Node (object):
nodes are returned wrapped as Node objects. A Node exposes the
raw DOM node it wraps via its .dom attribute."""
- def __init__(self, elt):
+ def __init__(self, elt, draft = 0):
"""Wraps a DOM node for ease of access.
- Single argument, elt, is the DOM node to wrap."""
+ First argument, elt, is the DOM node to wrap. (Optional second
+ argument, draft, should only be supplied by this class's
+ creation of child nodes; it is the maximum draft score of any
+ ancestor of the new node.)"""
self.dom = elt
+ try:
+ attr = elt.attributes['draft'].nodeValue
+ except KeyError:
+ self.draft = draft
+ else:
+ self.draft = max(draft, self.draftScore(attr))
def findAllChildren(self, tag, wanted = None):
"""All children that do have the given tag and attributes.
@@ -65,34 +76,60 @@ class Node (object):
Optional second argument, wanted, should either be None or map
attribute names to the values they must have. Only child nodes
- with these attributes set to the given values are yielded."""
+ with these attributes set to the given values are yielded."""
- cutoff = 4 # Only accept approved, for now
for child in self.dom.childNodes:
if child.nodeType != child.ELEMENT_NODE:
continue
if child.nodeName != tag:
continue
- try:
- draft = child.attributes['draft']
- except KeyError:
- pass
- else:
- if self.__draftScores.get(draft, 0) < cutoff:
- continue
-
- if wanted is not None:
+ if wanted:
try:
- if wanted and any(child.attributes[k].nodeValue != v for k, v in wanted.items()):
+ if any(child.attributes[k].nodeValue != v
+ for k, v in wanted.items()):
continue
except KeyError: # Some wanted attribute is missing
continue
- yield Node(child)
+ yield Node(child, self.draft)
+
+ def findUniqueChild(self, tag):
+ """Returns the single child with the given nodeName.
+
+ Raises Error if there is no such child or there is more than
+ one."""
+ seq = self.findAllChildren(tag)
+ try:
+ node = seq.next()
+ except StopIteration:
+ raise Error('No child found where one was expected', tag)
+ for it in seq:
+ raise Error('Many children found where only one was expected', tag)
+ return node
+
+ @classmethod
+ def draftScore(cls, level):
+ """Maps draft level names to numeric scores.
+
+ Single parameter, level, is the least sure value of the draft
+ attribute on a node that you're willing to accept; returns a
+ numeric value (lower is less drafty).
- __draftScores = dict(true = 0, unconfirmed = 1, provisional = 2,
- contributed = 3, approved = 4, false = 4)
+ Tempting as it is to insist on low draft scores, there are
+ many locales in which pretty much every leaf is
+ unconfirmed. It may make sense to actually check each
+ XmlScanner object, or each node in each LocaleScanner's nodes
+ list, to see what its distribution of draft level looks like,
+ so as to set the acceptable draft score for its elements
+ accordingly. However, for the moment, we mostly just accept
+ all elements, regardless of draft values (the one exception is
+ am/pm indicators)."""
+ return cls.__draftScores.get(level, 5) if level else 0
+
+ # Implementation details:
+ __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2,
+ contributed = 1, approved = 0, false = 0)
def _parseXPath(selector):
# Split "tag[attr=val][...]" into tag-name and attribute mapping
@@ -129,7 +166,6 @@ class XmlScanner (object):
return elts
class Supplement (XmlScanner):
- # Replaces xpathlite.findTagsInFile()
def find(self, xpath):
elts = self.findNodes(xpath)
for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
@@ -138,3 +174,381 @@ class Supplement (XmlScanner):
yield (elt.nodeName,
dict((k, v if isinstance(v, basestring) else v.nodeValue)
for k, v in elt.attributes.items()))
+
+class LocaleScanner (object):
+ def __init__(self, name, nodes, root):
+ self.name, self.nodes, self.base = name, nodes, root
+
+ def find(self, xpath, draft = None):
+ tags = xpath.split('/')
+ while True:
+ replace = None
+ for elt in self.nodes:
+ for selector in tags:
+ tag, attrs = _parseXPath(selector)
+ for elt in elt.findAllChildren(tag, attrs):
+ if draft is None or elt.draft <= draft:
+ break # and process the next selector
+ else:
+ break # no child, try next elt in self.nodes
+ else:
+ # processed all selectors
+ try:
+ return elt.dom.firstChild.nodeValue
+ except (AttributeError, KeyError):
+ pass # move on to next elt in self.nodes
+
+ # No match in self.nodes; check root
+ elt = self.base.root
+ for i, selector in enumerate(tags):
+ tag, attrs = _parseXPath(selector)
+ for alias in elt.findAllChildren('alias'):
+ if alias.dom.attributes['source'].nodeValue == 'locale':
+ replace = alias.dom.attributes['path'].nodeValue.split('/')
+ tags = self.__xpathJoin(tags[:i], replace, tags[i:])
+ break
+ else:
+ for elt in elt.findAllChildren(tag, attrs):
+ if draft is None or elt.draft <= draft:
+ break # and process the next selector
+ else:
+ break
+ if replace:
+ break
+ else:
+ # processed all selectors
+ try:
+ return elt.dom.firstChild.nodeValue
+ except (AttributeError, KeyError):
+ # No match
+ pass
+ if not replace:
+ break
+
+ sought = '/'.join(tags)
+ if sought != xpath:
+ sought += ' (for {})'.format(xpath)
+ raise Error('No {} in {}'.format(sought, self.name))
+
+ def findOr(self, xpath, fallback = ''):
+ """Use a fall-back value if we don't find data.
+
+ Like find, but takes a fall-back value to return instead of
+ raising Error on failure."""
+ try:
+ return self.find(xpath)
+ except Error:
+ return fallback
+
+ def tagCodes(self):
+ """Yields four tag codes
+
+ The tag codes are language, script, country and variant; an
+ empty value for any of them indicates that no value was
+ provided. The values are obtained from the primary file's
+ top-level <identity> element. An Error is raised if any
+ top-level <alias> element of this file has a non-empty source
+ attribute; that attribute value is mentioned in the error's
+ message."""
+ root = self.nodes[0]
+ for alias in root.findAllChildren('alias'):
+ try:
+ source = alias.dom.attributes['source'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ raise Error('Alias to {}'.format(source))
+
+ ids = root.findUniqueChild('identity')
+ for code in ('language', 'script', 'territory', 'variant'):
+ for node in ids.findAllChildren(code):
+ try:
+ yield node.dom.attributes['type'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ break # only want one value for each code
+ else: # No value for this code, use empty
+ yield ''
+
+ def currencyData(self, isoCode):
+ """Fetches currency data for this locale.
+
+ Single argument, isoCode, is the ISO currency code for the
+ currency in use in the country. See also numericData, which
+ includes some currency formats.
+ """
+ if isoCode:
+ stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
+ symbol = self.findOr(stem + 'symbol')
+ name = ';'.join(
+ self.findOr(stem + 'displayName' + tail)
+ for tail in ('',) + tuple(
+ '[count={}]'.format(x) for x in ('zero', 'one', 'two', 'few', 'many', 'other')
+ )) + ';'
+ else:
+ symbol = name = ''
+ yield 'currencySymbol', symbol
+ yield 'currencyDisplayName', name
+
+ def numericData(self, lookup, complain = lambda text: None):
+ """Generate assorted numeric data for the locale.
+
+ First argument, lookup, is a callable that maps a numbering
+ system's name to certain data about the system, as a mapping;
+ we expect this to have u'digits' as a key.
+ """
+ system = self.find('numbers/defaultNumberingSystem')
+ stem = 'numbers/symbols[numberSystem={}]/'.format(system)
+ decimal = self.find(stem + 'decimal')
+ group = self.find(stem + 'group')
+ assert decimal != group, (self.name, system, decimal)
+ yield 'decimal', decimal
+ yield 'group', group
+ yield 'percent', self.find(stem + 'percentSign')
+ yield 'list', self.find(stem + 'list')
+ # FIXME: don't lower-case:
+ yield 'exp', self.find(stem + 'exponential').lower()
+
+ digits = lookup(system)['digits']
+ assert len(digits) == 10
+ zero = digits[0]
+ # Qt's number-formatting code assumes digits are consecutive:
+ assert all(ord(c) == i for i, c in enumerate(digits, ord(zero)))
+ yield 'zero', zero
+
+ plus = self.find(stem + 'plusSign')
+ minus = self.find(stem + 'minusSign')
+ yield 'plus', plus
+ yield 'minus', minus
+
+ # Currency formatting (currencyFormat may have a type field):
+ money = self.find('numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern')
+ money = self.__currencyFormats(money, plus, minus)
+ yield 'currencyFormat', money.next()
+ neg = ''
+ for it in money:
+ assert not neg, 'There should be at most one more pattern'
+ neg = it
+ yield 'currencyNegativeFormat', neg
+
+ def textPatternData(self):
+ for key in ('quotationStart', 'alternateQuotationEnd',
+ 'quotationEnd', 'alternateQuotationStart'):
+ yield key, self.find('delimiters/' + key)
+
+ for key in ('start', 'middle', 'end'):
+ yield ('listPatternPart' + key.capitalize(),
+ self.__fromLdmlListPattern(self.find(
+ 'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+ yield ('listPatternPartTwo',
+ self.__fromLdmlListPattern(self.find(
+ 'listPatterns/listPattern/listPatternPart[2]')))
+
+ stem = 'dates/calendars/calendar[gregorian]/'
+ # TODO: is wide really the right width to use here ?
+ # abbreviated might be an option ... or try both ?
+ meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+ for key in ('am', 'pm'):
+ yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+ draft = Node.draftScore('contributed'))
+
+ for pair in (('long', 'full'), ('short', 'short')):
+ for key in ('time', 'date'):
+ yield (pair[0] + key.capitalize() + 'Format',
+ convert_date(self.find(
+ stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
+ key, key, pair[1], key))))
+
+ def endonyms(self, language, script, country, variant):
+ # TODO: take variant into account ?
+ for seq in ((language, script, country),
+ (language, script), (language, country), (language,)):
+ if not all(seq):
+ continue
+ try:
+ yield ('languageEndonym',
+ self.find('localeDisplayNames/languages/language[{}]'
+ .format('_'.join(seq))))
+ except Error:
+ pass
+ else:
+ break
+ else:
+ # grumble(failed to find endonym for language)
+ yield 'languageEndonym', ''
+
+ yield ('countryEndonym',
+ self.findOr('localeDisplayNames/territories/territory[{}]'
+ .format(country)))
+
+ def unitData(self):
+ yield ('byte_unit',
+ self.findOr('units/unitLength[long]/unit[digital-byte]/displayName',
+ 'bytes'))
+
+ unit = self.__findUnit('', 'B')
+ cache = [] # Populated by the SI call, to give hints to the IEC call
+ yield ('byte_si_quantified',
+ ';'.join(self.__unitCount('', unit, cache)))
+ # IEC 60027-2
+ # http://physics.nist.gov/cuu/Units/binary.html
+ yield ('byte_iec_quantified',
+ ';'.join(self.__unitCount('bi', 'iB', cache)))
+
+ def calendarNames(self, calendars):
+ namings = self.__nameForms
+ for cal in calendars:
+ stem = 'dates/calendars/calendar[' + cal + ']/months/'
+ for key, mode, size in namings:
+ prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
+ yield (key + 'Months_' + cal,
+ ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+ for i in range(1, 13)) + ';')
+
+ # Day data (for Gregorian, at least):
+ stem = 'dates/calendars/calendar[gregorian]/days/'
+ days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
+ for (key, mode, size) in namings:
+ prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
+ yield (key + 'Days',
+ ';'.join(self.find(stem + prop + '[' + day + ']')
+ for day in days) + ';')
+
+ # Implementation details
+ __nameForms = (
+ ('standaloneLong', 'stand-alone', 'wide'),
+ ('standaloneShort', 'stand-alone', 'abbreviated'),
+ ('standaloneNarrow', 'stand-alone', 'narrow'),
+ ('long', 'format', 'wide'),
+ ('short', 'format', 'abbreviated'),
+ ('narrow', 'format', 'narrow'),
+ ) # Used for month and day names
+
+ def __findUnit(self, keySuffix, quantify, fallback=''):
+ # The displayName for a quantified unit in en.xml is kByte
+ # (even for unitLength[narrow]) instead of kB (etc.), so
+ # prefer any unitPattern provided, but prune its placeholder:
+ for size in ('short', 'narrow'): # TODO: reverse order ?
+ stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+ for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+ try:
+ ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+ except Error:
+ continue
+
+ # TODO: do count-handling, instead of discarding placeholders
+ if False: # TODO: do it this way, instead !
+ ans = ans.replace('{0}', '').strip()
+ elif ans.startswith('{0}'):
+ ans = ans[3:].lstrip()
+ if ans:
+ return ans
+
+ try:
+ return self.find(stem + 'displayName')
+ except Error:
+ pass
+
+ return fallback
+
+ def __unitCount(self, keySuffix, suffix, cache,
+ # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+ # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+ siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+ """Work out the unit quantifiers.
+
+ Unfortunately, the CLDR data only go up to terabytes and we
+ want all the way to exabytes; but we can recognize the SI
+ quantifiers as prefixes, strip and identify the tail as the
+ localized translation for 'B' (e.g. French has 'octet' for
+ 'byte' and uses ko, Mo, Go, To from which we can extrapolate
+ Po, Eo).
+
+ Should be called first for the SI quantifiers, with suffix =
+ 'B', then for the IEC ones, with suffix = 'iB'; the list cache
+ (initially empty before first call) is used to let the second
+ call know what the first learned about the localized unit.
+ """
+ if suffix == 'iB': # second call, re-using first's cache
+ if cache:
+ byte = cache.pop()
+ if all(byte == k for k in cache):
+ suffix = 'i' + byte
+ for q in siQuantifiers:
+ # Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
+ yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+ else: # first call
+ tail = suffix = suffix or 'B'
+ for q in siQuantifiers:
+ it = self.__findUnit(keySuffix, q)
+ # kB for kilobyte, in contrast with KiB for IEC:
+ q = q[0] if q == 'kilo' else q[0].upper()
+ if not it:
+ it = q + tail
+ elif it.startswith(q):
+ rest = it[1:]
+ tail = rest if all(rest == k for k in cache) else suffix
+ cache.append(rest)
+ yield it
+
+ @staticmethod
+ def __currencyFormats(patterns, plus, minus):
+ for p in patterns.split(';'):
+ p = p.replace('0', '#').replace(',', '').replace('.', '')
+ try:
+ cut = p.find('#') + 1
+ except ValueError:
+ pass
+ else:
+ p = p[:cut] + p[cut:].replace('#', '')
+ p = p.replace('#', "%1")
+ # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
+ # there can be a doubled or tripled currency sign; however, none of the
+ # locales use that.
+ p = p.replace(u'\xa4', "%2")
+ # Single quote goes away, but double goes to single:
+ p = p.replace("''", '###').replace("'", '').replace('###', "'")
+ # Use number system's signs:
+ p = p.replace('+', plus).replace('-', minus)
+ yield p
+
+ @staticmethod
+ def __fromLdmlListPattern(pattern):
+ # This is a very limited parsing of the format for list pattern part only.
+ return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3')
+
+ @staticmethod
+ def __fromLdmlPath(seq): # tool function for __xpathJoin()
+ """Convert LDML's [@name='value'] to our [name=value] form."""
+ for it in seq:
+ # First dismember it:
+ attrs = it.split('[')
+ tag = attrs.pop(0)
+ if not attrs: # Short-cut the easy case:
+ yield it
+ continue
+
+ assert all(x.endswith(']') for x in attrs)
+ attrs = [x[:-1].split('=') for x in attrs]
+ # Then fix each attribute specification in it:
+ attrs = [(x[0][1:] if x[0].startswith('@') else x[0],
+ x[1][1:-1] if x[1].startswith("'") and x[1].endswith("'") else x[1])
+ for x in attrs]
+ # Finally, put it all back together:
+ attrs = ['='.join(x) + ']' for x in attrs]
+ attrs.insert(0, tag)
+ yield '['.join(attrs)
+
+ @classmethod
+ def __xpathJoin(cls, head, insert, tail):
+ """Join three lists of XPath selectors.
+
+ Each of head, insert and tail is a sequence of selectors but
+ insert may start with some uses of '..', that we want to
+ resolve away, and may use LDML's attribute format, that we
+ want to convert to our format."""
+ while insert and insert[0] == '..':
+ insert.pop(0)
+ head.pop()
+ return head + list(cls.__fromLdmlPath(insert)) + tail