summaryrefslogtreecommitdiffstats
path: root/util/locale_database/ldml.py
diff options
context:
space:
mode:
author Edward Welbourne <edward.welbourne@qt.io> 2020-02-12 11:15:27 +0100
committer Edward Welbourne <eddy@chaos.org.uk> 2020-04-02 19:43:28 +0100
commit 81cf23c7a75eeee5c10e108571fb4222552da5d3 (patch)
tree ebcb3370de376c108c7789b1ace2d624ae6aa8cb /util/locale_database/ldml.py
parent e5eb0aa4281016e7a2b3fd5c4381999c91438f5b (diff)
Take CLDR's distinguished attributes into account
When doing XPATH searches, child nodes that have distinguished attributes that were not asked for should be skipped. This is part of the LDML spec and matters when resolving locale inheritance. Scan the LDML DTD (previously only scanned for the CLDR version) to find which attributes of which tags are ignorable - all others are distinguished - and take the result into account when performing XPATH searches. The XPath we were using for currency formats wasn't excluding currencyFormatLength elements with type="short" and patterns specific to thousands (and larger multiples); this is fixed by taking distinguished attributes into account. However, the XPATH also wasn't specifying the always distinguished attribute type="standard" that was, in practice, used for nearly all locales that weren't (wrongly) using short-forms for thousands; so type="standard" is now made explicit, so as to minimize the diff. This leaves only twenty-one locales with negative currency formats. A later commit shall switch to using accounting by default (it falls back via an alias to standard, in any case), thereby restoring the two mentioned below that were using it by accident, but the present change gives the minimal diff here. Thousands-specific formats replaced with sensible ones: * zh_Hant_{HK,MO} (Traditional Mandarin, Hong Kong and Macau) * eo_001 (Esperanto) * fr_CA (Canadian French) * ha_* (Hausa, when not written in Arabic) * es_{GT,MX,US} (Spanish - Guatemala, Mexico, USA) * sw_KE (Swahili, Kenya) * yi_001 (Yiddish) * mfe_MU (Morisyen, Mauritius) * lag_TZ (Langi, Tanzania) * mgh_MZ (Makhuwa Meetto, Mozambique) * wae_CH (Walser, Switzerland) * kkj_CM (Kako, Cameroon) * lkt_US (Lakota, USA) * pa_Arab_PK (Punjabi, in Arabic script, as used in Pakistan; uses arabext number system, whose currency falls back to latn's, for which pa_Arab over-rides the thousands-format). 
Format changed from an over-ridden type="accounting" to standard (so these lost a negative-specific form) in: * en_SI (English, Slovenia) * es_DO (Spanish, Dominican Republic; same) For some locales we were picking up over-rides of narrow or short list formats, or formats for or-lists or unit-lists rather than and-lists, in place of the standard list format, that these locales don't over-ride, provided by a parent locale. This changed list formats for: * en_CA, en_IN (dropped "Oxford" comma before "and") * qu_* (Quechua; dropped "utaq", presumably meaning "and") * ur_IN (Urdu, India; was using unit-list formats) [ChangeLog][QtCore][QLocale] Data used for currency formats in several locales and list patterns in some locales have changed due to now parsing the CLDR data more faithfully. Fixes: QTBUG-81344 Change-Id: I6b95c6c37db92df167153767c1b103becfb0ac98 Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
Diffstat (limited to 'util/locale_database/ldml.py')
-rw-r--r-- util/locale_database/ldml.py 55
1 files changed, 40 insertions, 15 deletions
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
index a4a8448a43..940264674b 100644
--- a/util/locale_database/ldml.py
+++ b/util/locale_database/ldml.py
@@ -53,14 +53,21 @@ class Node (object):
nodes are returned wrapped as Node objects. A Node exposes the
raw DOM node it wraps via its .dom attribute."""
- def __init__(self, elt, draft = 0):
+ def __init__(self, elt, dullAttrs = None, draft = 0):
"""Wraps a DOM node for ease of access.
- First argument, elt, is the DOM node to wrap. (Optional second
- argument, draft, should only be supplied by this class's
- creation of child nodes; it is the maximum draft score of any
- ancestor of the new node.)"""
- self.dom = elt
+ First argument, elt, is the DOM node to wrap.
+
+ Optional second argument, dullAttrs, should either be None or
+ map each LDML tag name to a list of the names of
+ non-distinguishing attributes for nodes with the given tag
+ name. If None is given, no distinguishing attribute checks are
+ performed.
+
+ (Optional third argument, draft, should only be supplied by
+ this class's creation of child nodes; it is the maximum draft
+ score of any ancestor of the new node.)"""
+ self.dom, self.__dull = elt, dullAttrs
try:
attr = elt.attributes['draft'].nodeValue
except KeyError:
@@ -68,7 +75,7 @@ class Node (object):
else:
self.draft = max(draft, self.draftScore(attr))
- def findAllChildren(self, tag, wanted = None):
+ def findAllChildren(self, tag, wanted = None, allDull = False):
"""All children that do have the given tag and attributes.
First argument is the tag: children with any other tag are
@@ -76,7 +83,15 @@ class Node (object):
Optional second argument, wanted, should either be None or map
attribute names to the values they must have. Only child nodes
- with thes attributes set to the given values are yielded."""
+ with thes attributes set to the given values are yielded.
+
+ By default, nodes that have distinguishing attributes, other
+ than those specified in wanted, are ignored. Pass the allDull
+ parameter a true value to suppress this check."""
+
+ if self.__dull is None:
+ allDull = True
+ dull = () if allDull else self.__dull[tag]
for child in self.dom.childNodes:
if child.nodeType != child.ELEMENT_NODE:
@@ -92,7 +107,15 @@ class Node (object):
except KeyError: # Some wanted attribute is missing
continue
- yield Node(child, self.draft)
+ if not (allDull or all(k in dull or k in wanted
+ for k in child.attributes.keys())):
+ continue
+
+ elif not (allDull or all(k in dull
+ for k in child.attributes.keys())):
+ continue
+
+ yield Node(child, self.__dull, self.draft)
def findUniqueChild(self, tag):
"""Returns the single child with the given nodeName.
@@ -156,7 +179,9 @@ class XmlScanner (object):
self.root = node
def findNodes(self, xpath):
- """Return all nodes under self.root matching this xpath"""
+ """Return all nodes under self.root matching this xpath.
+
+ Ignores any excess attributes."""
elts = (self.root,)
for selector in xpath.split('/'):
tag, attrs = _parseXPath(selector)
@@ -202,7 +227,7 @@ class LocaleScanner (object):
elt = self.base.root
for i, selector in enumerate(tags):
tag, attrs = _parseXPath(selector)
- for alias in elt.findAllChildren('alias'):
+ for alias in elt.findAllChildren('alias', allDull = True):
if alias.dom.attributes['source'].nodeValue == 'locale':
replace = alias.dom.attributes['path'].nodeValue.split('/')
tags = self.__xpathJoin(tags[:i], replace, tags[i:])
@@ -251,7 +276,7 @@ class LocaleScanner (object):
attribute; that attribute value is mentioned in the error's
message."""
root = self.nodes[0]
- for alias in root.findAllChildren('alias'):
+ for alias in root.findAllChildren('alias', allDull=True):
try:
source = alias.dom.attributes['source'].nodeValue
except (KeyError, AttributeError):
@@ -261,7 +286,7 @@ class LocaleScanner (object):
ids = root.findUniqueChild('identity')
for code in ('language', 'script', 'territory', 'variant'):
- for node in ids.findAllChildren(code):
+ for node in ids.findAllChildren(code, allDull=True):
try:
yield node.dom.attributes['type'].nodeValue
except (KeyError, AttributeError):
@@ -322,8 +347,8 @@ class LocaleScanner (object):
yield 'plus', plus
yield 'minus', minus
- # Currency formatting (currencyFormat may have a type field):
- xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern'
+ # Currency formatting:
+ xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[standard]/pattern'
try:
money = self.find(xpath.replace('Formats/',
'Formats[numberSystem={}]/'.format(system)))