summaryrefslogtreecommitdiffstats
path: root/util
diff options
context:
space:
mode:
authorEdward Welbourne <edward.welbourne@qt.io>2020-02-12 11:15:27 +0100
committerEdward Welbourne <eddy@chaos.org.uk>2020-04-02 19:43:28 +0100
commit81cf23c7a75eeee5c10e108571fb4222552da5d3 (patch)
treeebcb3370de376c108c7789b1ace2d624ae6aa8cb /util
parente5eb0aa4281016e7a2b3fd5c4381999c91438f5b (diff)
Take CLDR's distinguished attributes into account
When doing XPATH searches, child nodes that have distinguished attributes that were not asked for should be skipped. This is part of the LDML spec and matters when resolving locale inheritance. Scan the LDML DTD (previously only scanned for the CLDR version) to find which attributes of which tags are ignorable - all others are distinguished - and take the result into account when performing XPATH searches. The XPath we were using for currency formats wasn't excluding currencyFormatLength elements with type="short" and patterns specific to thousands (and larger multiples); this is fixed by taking distinguished attributes into account. However, the XPATH also wasn't specifying the always distinguished attribute type="standard" that was, in practice, used for nearly all locales that weren't (wrongly) using short-forms for thousands; so type="standard" is now made explicit, so as to minimize the diff. This leaves only twenty-one locales with negative currency formats. A later commit shall switch to using accounting by default (it falls back via an alias to standard, in any case), thereby restoring the two mentioned below that were using it by accident, but the present change gives the minimal diff here. Thousands-specific formats replaced with sensible ones: * zh_Hant_{HK,MO} (Traditional Mandarin, Hong Kong and Macau) * eo_001 (Esperanto) * fr_CA (Canadian French) * ha_* (Hausa, when not written in Arabic) * es_{GT,MX,US} (Spanish - Guatemala, Mexico, USA) * sw_KE (Swahili, Kenya) * yi_001 (Yiddish) * mfe_MU (Morisyen, Mauritius) * lag_TZ (Langi, Tanzania) * mgh_MZ (Makhuwa Meetto, Mozambique) * wae_CH (Walser, Switzerland) * kkj_CM (Kako, Cameroon) * lkt_US (Lakota, USA) * pa_Arab_PK (Punjabi, in Arabic script, as used in Pakistan; uses arabext number system, whose currency falls back to latn's, for which pa_Arab over-rides the thousands-format).
Format changed from an over-ridden type="accounting" to standard (so these lost a negative-specific form) in: * en_SI (English, Slovenia) * es_DO (Spanish, Dominican Republic; same) For some locales we were picking up over-rides of narrow or short list formats, or formats for or-lists or unit-lists rather than and-lists, in place of the standard list format, that these locales don't over-ride, provided by a parent locale. This changed list formats for: * en_CA, en_IN (dropped "Oxford" comma before "and") * qu_* (Quechua; dropped "utaq", presumably meaning "and") * ur_IN (Urdu, India; was using unit-list formats) [ChangeLog][QtCore][QLocale] Data used for currency formats in several locales and list patterns in some locales have changed due to now parsing the CLDR data more faithfully. Fixes: QTBUG-81344 Change-Id: I6b95c6c37db92df167153767c1b103becfb0ac98 Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
Diffstat (limited to 'util')
-rw-r--r--util/locale_database/cldr.py63
-rw-r--r--util/locale_database/ldml.py55
2 files changed, 97 insertions, 21 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
index 94459b9e3f..0cc2560632 100644
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@@ -439,7 +439,7 @@ class CldrAccess (object):
@property
def cldrVersion(self):
# Evaluate so as to ensure __cldrVersion is set:
- self.__scanLdmlDtd()
+ self.__unDistinguishedAttributes
return self.__cldrVersion
# Implementation details
@@ -545,17 +545,68 @@ class CldrAccess (object):
return cache
+ @property
+ def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join):
+ """Mapping from tag names to lists of attributes.
+
+ LDML defines some attributes as 'distinguishing': if a node
+ has distinguishing attributes that weren't specified in an
+ XPath, a search on that XPath should exclude the node's
+ children.
+
+ This property is a mapping from tag names to tuples of
+ attribute names that *aren't* distinguishing for that tag.
+ Its value is cached (so its costly computation isonly done
+ once) and there's a side-effect of populating its cache: it
+ sets self.__cldrVersion to the value found in ldml.dtd, during
+ parsing."""
+ if not cache:
+ cache.update(self.__scanLdmlDtd())
+ assert cache
+
+ return cache
+
def __scanLdmlDtd(self, joinPath = os.path.join):
- """Scan the LDML DTD, record CLDR version."""
+ """Scan the LDML DTD, record CLDR version
+
+ Yields (tag, attrs) pairs: on elements with a given tag,
+ attributes named in its attrs (a tuple) may be ignored in an
+ XPath search; other attributes are distinguished attributes,
+ in the terminology of LDML's locale-inheritance rules.
+
+ Sets self.__cldrVersion as a side-effect, since this
+ information is found in the same file."""
with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
+ tag, ignored, last = None, None, None
+
for line in dtd:
+ if line.startswith('<!ELEMENT '):
+ if ignored:
+ assert tag
+ yield tag, tuple(ignored)
+ tag, ignored, last = line.split()[1], [], None
+ continue
+
if line.startswith('<!ATTLIST '):
+ assert tag is not None
parts = line.split()
+ assert parts[1] == tag
+ last = parts[2]
if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
- # parts[5] is the version, in quotes, maybe
- # with a final > attached to its end:
+ # parts[5] is the version, in quotes, although the final > might be stuck on its end:
self.__cldrVersion = parts[5].split('"')[1]
- break
+ continue
+
+ # <!ELEMENT...>s can also be @METADATA, but not @VALUE:
+ if '<!--@VALUE-->' in line or (last and '<!--@METADATA-->' in line):
+ assert last is not None
+ assert ignored is not None
+ assert tag is not None
+ ignored.append(last)
+ last = None # No attribute is both value and metadata
+
+ if tag and ignored:
+ yield tag, tuple(ignored)
def __enumMap(self, key, cache = {}):
if not cache:
@@ -650,7 +701,7 @@ class CldrAccess (object):
while name and name != 'root':
doc = self.__localeAsDoc(name)
if doc is not None:
- yield Node(doc)
+ yield Node(doc, self.__unDistinguishedAttributes)
try:
name = self.__parentLocale(name)
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index a4a8448a43..940264674b 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -53,14 +53,21 @@ class Node (object):
nodes are returned wrapped as Node objects. A Node exposes the
raw DOM node it wraps via its .dom attribute."""
- def __init__(self, elt, draft = 0):
+ def __init__(self, elt, dullAttrs = None, draft = 0):
"""Wraps a DOM node for ease of access.
- First argument, elt, is the DOM node to wrap. (Optional second
- argument, draft, should only be supplied by this class's
- creation of child nodes; it is the maximum draft score of any
- ancestor of the new node.)"""
- self.dom = elt
+ First argument, elt, is the DOM node to wrap.
+
+ Optional second argument, dullAttrs, should either be None or
+ map each LDML tag name to a list of the names of
+ non-distinguishing attributes for nodes with the given tag
+ name. If None is given, no distinguishing attribute checks are
+ performed.
+
+ (Optional third argument, draft, should only be supplied by
+ this class's creation of child nodes; it is the maximum draft
+ score of any ancestor of the new node.)"""
+ self.dom, self.__dull = elt, dullAttrs
try:
attr = elt.attributes['draft'].nodeValue
except KeyError:
@@ -68,7 +75,7 @@ class Node (object):
else:
self.draft = max(draft, self.draftScore(attr))
- def findAllChildren(self, tag, wanted = None):
+ def findAllChildren(self, tag, wanted = None, allDull = False):
"""All children that do have the given tag and attributes.
First argument is the tag: children with any other tag are
@@ -76,7 +83,15 @@ class Node (object):
Optional second argument, wanted, should either be None or map
attribute names to the values they must have. Only child nodes
- with thes attributes set to the given values are yielded."""
+ with thes attributes set to the given values are yielded.
+
+ By default, nodes that have distinguishing attributes, other
+ than those specified in wanted, are ignored. Pass the allDull
+ parameter a true value to suppress this check."""
+
+ if self.__dull is None:
+ allDull = True
+ dull = () if allDull else self.__dull[tag]
for child in self.dom.childNodes:
if child.nodeType != child.ELEMENT_NODE:
@@ -92,7 +107,15 @@ class Node (object):
except KeyError: # Some wanted attribute is missing
continue
- yield Node(child, self.draft)
+ if not (allDull or all(k in dull or k in wanted
+ for k in child.attributes.keys())):
+ continue
+
+ elif not (allDull or all(k in dull
+ for k in child.attributes.keys())):
+ continue
+
+ yield Node(child, self.__dull, self.draft)
def findUniqueChild(self, tag):
"""Returns the single child with the given nodeName.
@@ -156,7 +179,9 @@ class XmlScanner (object):
self.root = node
def findNodes(self, xpath):
- """Return all nodes under self.root matching this xpath"""
+ """Return all nodes under self.root matching this xpath.
+
+ Ignores any excess attributes."""
elts = (self.root,)
for selector in xpath.split('/'):
tag, attrs = _parseXPath(selector)
@@ -202,7 +227,7 @@ class LocaleScanner (object):
elt = self.base.root
for i, selector in enumerate(tags):
tag, attrs = _parseXPath(selector)
- for alias in elt.findAllChildren('alias'):
+ for alias in elt.findAllChildren('alias', allDull = True):
if alias.dom.attributes['source'].nodeValue == 'locale':
replace = alias.dom.attributes['path'].nodeValue.split('/')
tags = self.__xpathJoin(tags[:i], replace, tags[i:])
@@ -251,7 +276,7 @@ class LocaleScanner (object):
attribute; that attribute value is mentioned in the error's
message."""
root = self.nodes[0]
- for alias in root.findAllChildren('alias'):
+ for alias in root.findAllChildren('alias', allDull=True):
try:
source = alias.dom.attributes['source'].nodeValue
except (KeyError, AttributeError):
@@ -261,7 +286,7 @@ class LocaleScanner (object):
ids = root.findUniqueChild('identity')
for code in ('language', 'script', 'territory', 'variant'):
- for node in ids.findAllChildren(code):
+ for node in ids.findAllChildren(code, allDull=True):
try:
yield node.dom.attributes['type'].nodeValue
except (KeyError, AttributeError):
@@ -322,8 +347,8 @@ class LocaleScanner (object):
yield 'plus', plus
yield 'minus', minus
- # Currency formatting (currencyFormat may have a type field):
- xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern'
+ # Currency formatting:
+ xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[standard]/pattern'
try:
money = self.find(xpath.replace('Formats/',
'Formats[numberSystem={}]/'.format(system)))