diff options
author | Edward Welbourne <edward.welbourne@qt.io> | 2020-02-12 11:15:27 +0100 |
---|---|---|
committer | Edward Welbourne <eddy@chaos.org.uk> | 2020-04-02 19:43:28 +0100 |
commit | 81cf23c7a75eeee5c10e108571fb4222552da5d3 (patch) | |
tree | ebcb3370de376c108c7789b1ace2d624ae6aa8cb /util/locale_database/ldml.py | |
parent | e5eb0aa4281016e7a2b3fd5c4381999c91438f5b (diff) |
Take CLDR's distinguished attributes into account
When doing XPATH searches, child nodes that have distinguished
attributes that were not asked for should be skipped. This is part of
the LDML spec and matters when resolving locale inheritance. Scan the
LDML DTD (previously only scanned for the CLDR version) to find which
attributes of which tags are ignorable - all others are distinguished
- and take the result into account when performing XPATH searches.
The XPath we were using for currency formats wasn't excluding
currencyFormatLength elements with type="short" and patterns specific
to thousands (and larger multiples); this is fixed by taking
distinguished attributes into account. However, the XPATH also wasn't
specifying the always distinguished attribute type="standard" that
was, in practice, used for nearly all locales that weren't (wrongly)
using short-forms for thousands; so type="standard" is now made
explicit, so as to minimize the diff.
This leaves only twenty-one locales with negative currency formats.
A later commit shall switch to using accounting by default (it falls
back via an alias to standard, in any case), thereby restoring the two
mentioned below that were using it by accident, but the present change
gives the minimal diff here.
Thousands-specific formats replaced with sensible ones:
* zh_Hant_{HK,MO} (Traditional Mandarin, Hong Kong and Macau)
* eo_001 (Esperanto)
* fr_CA (Canadian French)
* ha_* (Hausa, when not written in Arabic)
* es_{GT,MX,US} (Spanish - Guatemala, Mexico, USA)
* sw_KE (Swahili, Kenya)
* yi_001 (Yiddish)
* mfe_MU (Morisyen, Mauritius)
* lag_TZ (Langi, Tanzania)
* mgh_MZ (Makhuwa Meetto, Mozambique)
* wae_CH (Walser, Switzerland)
* kkj_CM (Kako, Cameroon)
* lkt_US (Lakota, USA)
* pa_Arab_PK (Punjabi, in Arabic script, as used in Pakistan; uses
arabext number system, whose currency falls back to latn's, for
which pa_Arab over-rides the thousands-format).
Format changed from an over-ridden type="accounting" to standard (so
these lost a negative-specific form) in:
* en_SI (English, Slovenia)
* es_DO (Spanish, Dominican Republic; same)
For some locales we were picking up over-rides of narrow or short list
formats, or formats for or-lists or unit-lists rather than and-lists,
in place of the standard list format, which these locales don't
over-ride but inherit from a parent locale. This changed list formats for:
* en_CA, en_IN (dropped "Oxford" comma before "and")
* qu_* (Quechua; dropped "utaq", presumably meaning "and")
* ur_IN (Urdu, India; was using unit-list formats)
[ChangeLog][QtCore][QLocale] Data used for currency formats in several
locales and list patterns in some locales have changed due to now
parsing the CLDR data more faithfully.
Fixes: QTBUG-81344
Change-Id: I6b95c6c37db92df167153767c1b103becfb0ac98
Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
Diffstat (limited to 'util/locale_database/ldml.py')
-rw-r--r-- | util/locale_database/ldml.py | 55 |
1 files changed, 40 insertions, 15 deletions
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py index a4a8448a43..940264674b 100644 --- a/util/locale_database/ldml.py +++ b/util/locale_database/ldml.py @@ -53,14 +53,21 @@ class Node (object): nodes are returned wrapped as Node objects. A Node exposes the raw DOM node it wraps via its .dom attribute.""" - def __init__(self, elt, draft = 0): + def __init__(self, elt, dullAttrs = None, draft = 0): """Wraps a DOM node for ease of access. - First argument, elt, is the DOM node to wrap. (Optional second - argument, draft, should only be supplied by this class's - creation of child nodes; it is the maximum draft score of any - ancestor of the new node.)""" - self.dom = elt + First argument, elt, is the DOM node to wrap. + + Optional second argument, dullAttrs, should either be None or + map each LDML tag name to a list of the names of + non-distinguishing attributes for nodes with the given tag + name. If None is given, no distinguishing attribute checks are + performed. + + (Optional third argument, draft, should only be supplied by + this class's creation of child nodes; it is the maximum draft + score of any ancestor of the new node.)""" + self.dom, self.__dull = elt, dullAttrs try: attr = elt.attributes['draft'].nodeValue except KeyError: @@ -68,7 +75,7 @@ class Node (object): else: self.draft = max(draft, self.draftScore(attr)) - def findAllChildren(self, tag, wanted = None): + def findAllChildren(self, tag, wanted = None, allDull = False): """All children that do have the given tag and attributes. First argument is the tag: children with any other tag are @@ -76,7 +83,15 @@ class Node (object): Optional second argument, wanted, should either be None or map attribute names to the values they must have. Only child nodes - with thes attributes set to the given values are yielded.""" + with thes attributes set to the given values are yielded. 
+ + By default, nodes that have distinguishing attributes, other + than those specified in wanted, are ignored. Pass the allDull + parameter a true value to suppress this check.""" + + if self.__dull is None: + allDull = True + dull = () if allDull else self.__dull[tag] for child in self.dom.childNodes: if child.nodeType != child.ELEMENT_NODE: @@ -92,7 +107,15 @@ class Node (object): except KeyError: # Some wanted attribute is missing continue - yield Node(child, self.draft) + if not (allDull or all(k in dull or k in wanted + for k in child.attributes.keys())): + continue + + elif not (allDull or all(k in dull + for k in child.attributes.keys())): + continue + + yield Node(child, self.__dull, self.draft) def findUniqueChild(self, tag): """Returns the single child with the given nodeName. @@ -156,7 +179,9 @@ class XmlScanner (object): self.root = node def findNodes(self, xpath): - """Return all nodes under self.root matching this xpath""" + """Return all nodes under self.root matching this xpath. 
+ + Ignores any excess attributes.""" elts = (self.root,) for selector in xpath.split('/'): tag, attrs = _parseXPath(selector) @@ -202,7 +227,7 @@ class LocaleScanner (object): elt = self.base.root for i, selector in enumerate(tags): tag, attrs = _parseXPath(selector) - for alias in elt.findAllChildren('alias'): + for alias in elt.findAllChildren('alias', allDull = True): if alias.dom.attributes['source'].nodeValue == 'locale': replace = alias.dom.attributes['path'].nodeValue.split('/') tags = self.__xpathJoin(tags[:i], replace, tags[i:]) @@ -251,7 +276,7 @@ class LocaleScanner (object): attribute; that attribute value is mentioned in the error's message.""" root = self.nodes[0] - for alias in root.findAllChildren('alias'): + for alias in root.findAllChildren('alias', allDull=True): try: source = alias.dom.attributes['source'].nodeValue except (KeyError, AttributeError): @@ -261,7 +286,7 @@ class LocaleScanner (object): ids = root.findUniqueChild('identity') for code in ('language', 'script', 'territory', 'variant'): - for node in ids.findAllChildren(code): + for node in ids.findAllChildren(code, allDull=True): try: yield node.dom.attributes['type'].nodeValue except (KeyError, AttributeError): @@ -322,8 +347,8 @@ class LocaleScanner (object): yield 'plus', plus yield 'minus', minus - # Currency formatting (currencyFormat may have a type field): - xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern' + # Currency formatting: + xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[standard]/pattern' try: money = self.find(xpath.replace('Formats/', 'Formats[numberSystem={}]/'.format(system))) |