diff options
author | Edward Welbourne <edward.welbourne@qt.io> | 2020-02-12 11:15:27 +0100 |
---|---|---|
committer | Edward Welbourne <eddy@chaos.org.uk> | 2020-04-02 19:43:28 +0100 |
commit | 81cf23c7a75eeee5c10e108571fb4222552da5d3 (patch) | |
tree | ebcb3370de376c108c7789b1ace2d624ae6aa8cb /util/locale_database/cldr.py | |
parent | e5eb0aa4281016e7a2b3fd5c4381999c91438f5b (diff) |
Take CLDR's distinguished attributes into account
When doing XPATH searches, child nodes that have distinguished
attributes that were not asked for should be skipped. This is part of
the LDML spec and matters when resolving locale inheritance. Scan the
LDML DTD (previously only scanned for the CLDR version) to find which
attributes of which tags are ignorable - all others are distinguished
- and take the result into account when performing XPATH searches.
The XPath we were using for currency formats wasn't excluding
currencyFormatLength elements with type="short" and patterns specific
to thousands (and larger multiples); this is fixed by taking
distinguished attributes into account. However, the XPATH also wasn't
specifying the always distinguished attribute type="standard" that
was, in practice, used for nearly all locales that weren't (wrongly)
using short-forms for thousands; so type="standard" is now made
explicit, so as to minimize the diff.
This leaves only twenty-one locales with negative currency formats.
A later commit shall switch to using accounting by default (it falls
back via an alias to standard, in any case), thereby restoring the two
mentioned below that were using it by accident, but the present change
gives the minimal diff here.
Thousands-specific formats replaced with sensible ones:
* zh_Hant_{HK,MO} (Traditional Mandarin, Hong Kong and Macau)
* eo_001 (Esperanto)
* fr_CA (Canadian French)
* ha_* (Hausa, when not written in Arabic)
* es_{GT,MX,US} (Spanish - Guatemala, Mexico, USA)
* sw_KE (Swahili, Kenya)
* yi_001 (Yiddish)
* mfe_MU (Morisyen, Mauritius)
* lag_TZ (Langi, Tanzania)
* mgh_MZ (Makhuwa Meetto, Mozambique)
* wae_CH (Walser, Switzerland)
* kkj_CM (Kako, Cameroon)
* lkt_US (Lakota, USA)
* pa_Arab_PK (Punjabi, in Arabic script, as used in Pakistan; uses
arabext number system, whose currency falls back to latn's, for
which pa_Arab over-rides the thousands-format).
Format changed from an over-ridden type="accounting" to standard (so
these lost a negative-specific form) in:
* en_SI (English, Slovenia)
* es_DO (Spanish, Dominican Republic; same)
For some locales we were picking up over-rides of narrow or short list
formats, or formats for or-lists or unit-lists rather than and-lists,
in place of the standard list format, which these locales don't
over-ride and so inherit from a parent locale. This changed list formats for:
* en_CA, en_IN (dropped "Oxford" comma before "and")
* qu_* (Quechua; dropped "utaq", presumably meaning "and")
* ur_IN (Urdu, India; was using unit-list formats)
[ChangeLog][QtCore][QLocale] Data used for currency formats in several
locales and list patterns in some locales have changed due to now
parsing the CLDR data more faithfully.
Fixes: QTBUG-81344
Change-Id: I6b95c6c37db92df167153767c1b103becfb0ac98
Reviewed-by: Cristian Maureira-Fredes <cristian.maureira-fredes@qt.io>
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- | util/locale_database/cldr.py | 63 |
1 files changed, 57 insertions, 6 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py index 94459b9e3f..0cc2560632 100644 --- a/util/locale_database/cldr.py +++ b/util/locale_database/cldr.py @@ -439,7 +439,7 @@ class CldrAccess (object): @property def cldrVersion(self): # Evaluate so as to ensure __cldrVersion is set: - self.__scanLdmlDtd() + self.__unDistinguishedAttributes return self.__cldrVersion # Implementation details @@ -545,17 +545,68 @@ class CldrAccess (object): return cache + @property + def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join): + """Mapping from tag names to lists of attributes. + + LDML defines some attributes as 'distinguishing': if a node + has distinguishing attributes that weren't specified in an + XPath, a search on that XPath should exclude the node's + children. + + This property is a mapping from tag names to tuples of + attribute names that *aren't* distinguishing for that tag. + Its value is cached (so its costly computation isonly done + once) and there's a side-effect of populating its cache: it + sets self.__cldrVersion to the value found in ldml.dtd, during + parsing.""" + if not cache: + cache.update(self.__scanLdmlDtd()) + assert cache + + return cache + def __scanLdmlDtd(self, joinPath = os.path.join): - """Scan the LDML DTD, record CLDR version.""" + """Scan the LDML DTD, record CLDR version + + Yields (tag, attrs) pairs: on elements with a given tag, + attributes named in its attrs (a tuple) may be ignored in an + XPath search; other attributes are distinguished attributes, + in the terminology of LDML's locale-inheritance rules. 
+ + Sets self.__cldrVersion as a side-effect, since this + information is found in the same file.""" with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd: + tag, ignored, last = None, None, None + for line in dtd: + if line.startswith('<!ELEMENT '): + if ignored: + assert tag + yield tag, tuple(ignored) + tag, ignored, last = line.split()[1], [], None + continue + if line.startswith('<!ATTLIST '): + assert tag is not None parts = line.split() + assert parts[1] == tag + last = parts[2] if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']: - # parts[5] is the version, in quotes, maybe - # with a final > attached to its end: + # parts[5] is the version, in quotes, although the final > might be stuck on its end: self.__cldrVersion = parts[5].split('"')[1] - break + continue + + # <!ELEMENT...>s can also be @METADATA, but not @VALUE: + if '<!--@VALUE-->' in line or (last and '<!--@METADATA-->' in line): + assert last is not None + assert ignored is not None + assert tag is not None + ignored.append(last) + last = None # No attribute is both value and metadata + + if tag and ignored: + yield tag, tuple(ignored) def __enumMap(self, key, cache = {}): if not cache: @@ -650,7 +701,7 @@ class CldrAccess (object): while name and name != 'root': doc = self.__localeAsDoc(name) if doc is not None: - yield Node(doc) + yield Node(doc, self.__unDistinguishedAttributes) try: name = self.__parentLocale(name) |