diff options
author | Qt Forward Merge Bot <qt_forward_merge_bot@qt-project.org> | 2020-04-07 01:00:12 +0200 |
---|---|---|
committer | Fabian Kosmale <fabian.kosmale@qt.io> | 2020-04-08 20:11:39 +0200 |
commit | 8823bb8d306d78dd6a2e121a708dc607beff58c8 (patch) | |
tree | 5ca170aa36aa1381b0f31dae6709fd2ce68be344 /util/locale_database/cldr.py | |
parent | 5422fb79486a1818d6355d75f019fe63120a43d0 (diff) | |
parent | 14c55e29794b4f1d6e010fdf7082ef55cbf8f275 (diff) |
Merge remote-tracking branch 'origin/5.15' into dev
Conflicts:
examples/opengl/doc/src/cube.qdoc
src/corelib/global/qlibraryinfo.cpp
src/corelib/text/qbytearray_p.h
src/corelib/text/qlocale_data_p.h
src/corelib/time/qhijricalendar_data_p.h
src/corelib/time/qjalalicalendar_data_p.h
src/corelib/time/qromancalendar_data_p.h
src/network/ssl/qsslcertificate.h
src/widgets/doc/src/graphicsview.qdoc
src/widgets/widgets/qcombobox.cpp
src/widgets/widgets/qcombobox.h
tests/auto/corelib/tools/qscopeguard/tst_qscopeguard.cpp
tests/auto/widgets/widgets/qcombobox/tst_qcombobox.cpp
tests/benchmarks/corelib/io/qdiriterator/qdiriterator.pro
tests/manual/diaglib/debugproxystyle.cpp
tests/manual/diaglib/qwidgetdump.cpp
tests/manual/diaglib/qwindowdump.cpp
tests/manual/diaglib/textdump.cpp
util/locale_database/cldr2qlocalexml.py
util/locale_database/qlocalexml.py
util/locale_database/qlocalexml2cpp.py
Resolution of util/locale_database/ are based on:
https://codereview.qt-project.org/c/qt/qtbase/+/294250
and src/corelib/{text,time}/*_data_p.h were then regenerated by
running those scripts.
Updated CMakeLists.txt in each of
tests/auto/corelib/serialization/qcborstreamreader/
tests/auto/corelib/serialization/qcborvalue/
tests/auto/gui/kernel/
and generated new ones in each of
tests/auto/gui/kernel/qaddpostroutine/
tests/auto/gui/kernel/qhighdpiscaling/
tests/libfuzzer/corelib/text/qregularexpression/optimize/
tests/libfuzzer/gui/painting/qcolorspace/fromiccprofile/
tests/libfuzzer/gui/text/qtextdocument/sethtml/
tests/libfuzzer/gui/text/qtextdocument/setmarkdown/
tests/libfuzzer/gui/text/qtextlayout/beginlayout/
by running util/cmake/pro2cmake.py on their changed .pro files.
Changed target name in
tests/auto/gui/kernel/qaction/qaction.pro
tests/auto/gui/kernel/qaction/qactiongroup.pro
tests/auto/gui/kernel/qshortcut/qshortcut.pro
to ensure unique target names for CMake
Changed tst_QComboBox::currentIndex to not test the
currentIndexChanged(QString), as that one does not exist in Qt 6
anymore.
Change-Id: I9a85705484855ae1dc874a81f49d27a50b0dcff7
Diffstat (limited to 'util/locale_database/cldr.py')
-rw-r--r-- | util/locale_database/cldr.py | 718 |
1 files changed, 718 insertions, 0 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py new file mode 100644 index 0000000000..4b54f50080 --- /dev/null +++ b/util/locale_database/cldr.py @@ -0,0 +1,718 @@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Digesting the CLDR's data. + +Provides two classes: + CldrReader -- driver for reading CLDR data + CldrAccess -- used by the reader to access the tree of data files + +The former should normally be all you need to access. +See individual classes for further detail. +""" + +from xml.dom import minidom +from weakref import WeakValueDictionary as CacheDict +import os + +from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner +from qlocalexml import Locale + +class CldrReader (object): + def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None): + """Set up a reader object for reading CLDR data. + + Single parameter, root, is the file-system path to the root of + the unpacked CLDR archive; its common/ sub-directory should + contain dtd/, main/ and supplemental/ sub-directories. + + Optional second argument, grumble, is a callable that logs + warnings and complaints, e.g. sys.stderr.write would be a + suitable callable. The default is a no-op that ignores its + single argument. Optional third argument is similar, used for + less interesting output; pass sys.stderr.write for it for + verbose output.""" + self.root = CldrAccess(root) + self.whitter, self.grumble = whitter, grumble + + def likelySubTags(self): + """Generator for likely subtag information. + + Yields pairs (have, give) of 4-tuples; if what you have + matches the left member, giving the right member is probably + sensible. Each 4-tuple's entries are the full names of a + language, a script, a country (strictly territory) and a + variant (currently ignored).""" + skips = [] + for got, use in self.root.likelySubTags(): + try: + have = self.__parseTags(got) + give = self.__parseTags(use) + except Error as e: + if ((use.startswith(got) or got.startswith('und_')) + and e.message.startswith('Unknown ') and ' code ' in e.message): + skips.append(use) + else: + self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message)) + continue + if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]): + continue + + give = (give[0], + # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags + have[1] if give[1] == 'AnyScript' else give[1], + have[2] if give[2] == 'AnyCountry' else give[2], + give[3]) # AnyVariant similarly ? + + yield have, give + + if skips: + # TODO: look at LDML's reserved locale tag names; they + # show up a lot in this, and may be grounds for filtering + # more out. + pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips) + + def readLocales(self, calendars = ('gregorian',)): + locales = tuple(self.__allLocales(calendars)) + return dict(((k.language_id, k.script_id, k.country_id, k.variant_code), + k) for k in locales) + + def __allLocales(self, calendars): + def skip(locale, reason): + return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason) + + for locale in self.root.defaultContentLocales: + try: + language, script, country, variant = self.__splitLocale(locale) + except ValueError: + self.whitter(skip(locale, 'only language tag')) + continue + + if not (script or country): + self.grumble(skip(locale, 'second tag is neither script nor territory')) + continue + + if not (language and country): + continue + + try: + yield self.__getLocaleData(self.root.locale(locale), calendars, + language, script, country, variant) + except Error as e: + self.grumble(skip(locale, e.message)) + + for locale in self.root.fileLocales: + try: + chain = self.root.locale(locale) + language, script, country, variant = chain.tagCodes() + assert language + # TODO: this skip should probably be based on likely + # sub-tags, instead of empty country: if locale has a + # likely-subtag expansion, that's what QLocale uses, + # and we'll be saving its data for the expanded locale + # anyway, so don't need to record it for itself. + # See also QLocaleXmlReader.loadLocaleMap's grumble. + if not country: + continue + yield self.__getLocaleData(chain, calendars, language, script, country, variant) + except Error as e: + self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message)) + + import textwrap + @staticmethod + def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap): + writer('\n'.join(wrap(prefix + ', '.join(tokens), + subsequent_indent=' ', width=80)) + '\n') + del textwrap + + def __parseTags(self, locale): + tags = self.__splitLocale(locale) + language = tags.next() + script = country = variant = '' + try: + script, country, variant = tags + except ValueError: + pass + return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant)) + + def __splitLocale(self, name): + """Generate (language, script, territory, variant) from a locale name + + Ignores any trailing fields (with a warning), leaves script (a + capitalised four-letter token), territory (either a number or + an all-uppercase token) or variant (upper case and digits) + empty if unspecified. Only generates one entry if name is a + single tag (i.e. contains no underscores). Always yields 1 or + 4 values, never 2 or 3.""" + tags = iter(name.split('_')) + yield tags.next() # Language + tag = tags.next() # may raise StopIteration + + # Script is always four letters, always capitalised: + if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Territory is upper-case or numeric: + if tag and tag.isupper() or tag.isdigit(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Variant can be any mixture of upper-case and digits. + if tag and all(c.isupper() or c.isdigit() for c in tag): + yield tag + tag = '' + else: + yield '' + + # If nothing is left, StopIteration will avoid the warning: + if not tag: + tag = tags.next() + self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join(tag + tuple(tags)), name)) + + def __getLocaleData(self, scan, calendars, language, script, country, variant): + ids, names = zip(*self.root.codesToIdName(language, script, country, variant)) + assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant) + locale = Locale( + language = names[0], language_code = language, language_id = ids[0], + script = names[1], script_code = script, script_id = ids[1], + country = names[2], country_code = country, country_id = ids[2], + variant_code = variant) + + firstDay, weStart, weEnd = self.root.weekData(country) + assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') + for day in (firstDay, weStart, weEnd)) + + locale.update(firstDayOfWeek = firstDay, + weekendStart = weStart, + weekendEnd = weEnd) + + iso, digits, rounding = self.root.currencyData(country) + locale.update(currencyIsoCode = iso, + currencyDigits = int(digits), + currencyRounding = int(rounding)) + + locale.update(scan.currencyData(iso)) + locale.update(scan.numericData(self.root.numberSystem, self.whitter)) + locale.update(scan.textPatternData()) + locale.update(scan.endonyms(language, script, country, variant)) + locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ... + locale.update(scan.calendarNames(calendars)) # Names of days and months + + return locale + +# Note: various caches assume this class is a singleton, so the +# "default" value for a parameter no caller should pass can serve as +# the cache. If a process were to instantiate this class with distinct +# roots, each cache would be filled by the first to need it ! +class CldrAccess (object): + def __init__(self, root): + """Set up a master object for accessing CLDR data. + + Single parameter, root, is the file-system path to the root of + the unpacked CLDR archive; its common/ sub-directory should + contain dtd/, main/ and supplemental/ sub-directories.""" + self.root = root + + def xml(self, *path): + """Load a single XML file and return its root element as an XmlScanner. + + The path is interpreted relative to self.root""" + return XmlScanner(Node(self.__xml(path))) + + def supplement(self, name): + """Loads supplemental data as a Supplement object. + + The name should be that of a file in common/supplemental/, without path. + """ + return Supplement(Node(self.__xml(('common', 'supplemental', name)))) + + def locale(self, name): + """Loads all data for a locale as a LocaleScanner object. + + The name should be a locale name; adding suffix '.xml' to it + should usually yield a file in common/main/. The returned + LocaleScanner object packages this file along with all those + from which it inherits; its methods know how to handle that + inheritance, where relevant.""" + return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) + + @property + def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir, + splitExtension = os.path.splitext): + """Generator for locale IDs seen in file-names. + + All *.xml other than root.xml in common/main/ are assumed to + identify locales.""" + for name in listDirectory(joinPath(self.root, 'common', 'main')): + stem, ext = splitExtension(name) + if ext == '.xml' and stem != 'root': + yield stem + + @property + def defaultContentLocales(self): + """Generator for the default content locales.""" + for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'): + try: + locales = attrs['locales'] + except KeyError: + pass + else: + for locale in locales.split(): + yield locale + + def likelySubTags(self): + for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'): + yield attrs['from'], attrs['to'] + + def numberSystem(self, system): + """Get a description of a numbering system. + + Returns a mapping, with keys u'digits', u'type' and u'id'; the + value for this last is system. Raises KeyError for unknown + number system, ldml.Error on failure to load data.""" + try: + return self.__numberSystems[system] + except KeyError: + raise Error('Unsupported number system: {}'.format(system)) + + def weekData(self, country): + """Data on the weekly cycle. + + Returns a triple (W, S, E) of en's short names for week-days; + W is the first day of the week, S the start of the week-end + and E the end of the week-end. Where data for a country is + unavailable, the data for CLDR's territory 001 (The World) is + used.""" + try: + return self.__weekData[country] + except KeyError: + return self.__weekData['001'] + + def currencyData(self, country): + """Returns currency data for the given country code. + + Return value is a tuple (ISO4217 code, digit count, rounding + mode). If CLDR provides no data for this country, ('', 2, 1) + is the default result. + """ + try: + return self.__currencyData[country] + except KeyError: + return '', 2, 1 + + def codesToIdName(self, language, script, country, variant = ''): + """Maps each code to the appropriate ID and name. + + Returns a 4-tuple of (ID, name) pairs corresponding to the + language, script, country and variant given. Raises a + suitable error if any of them is unknown, indicating all that + are unknown plus suitable names for any that could sensibly be + added to enumdata.py to make them known. + + Until we implement variant support (QTBUG-81051), the fourth + member of the returned tuple is always 0 paired with a string + that should not be used.""" + enum = self.__enumMap + try: + return (enum('language')[language], + enum('script')[script], + enum('country')[country], + enum('variant')[variant]) + except KeyError: + pass + + parts, values = [], [language, script, country, variant] + for index, key in enumerate(('language', 'script', 'country', 'variant')): + naming, enums = self.__codeMap(key), enum(key) + value = values[index] + if value not in enums: + text = '{} code {}'.format(key, value) + name = naming.get(value) + if name and value != 'POSIX': + text += u' (could add {})'.format(name) + parts.append(text) + if len(parts) > 1: + parts[-1] = 'and ' + parts[-1] + assert parts + raise Error('Unknown ' + ', '.join(parts), + language, script, country, variant) + + def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py + """Digest CLDR's MS-Win time-zone name mapping. + + MS-Win have their own eccentric names for time-zones. CLDR + helpfully provides a translation to more orthodox names. + + Singe argument, lookup, is a mapping from known MS-Win names + for locales to a unique integer index (starting at 1). + + The XML structure we read has the form: + + <supplementalData> + <windowsZones> + <mapTimezones otherVersion="..." typeVersion="..."> + <!-- (UTC-08:00) Pacific Time (US & Canada) --> + <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/> + <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/> + <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/> + <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/> + </mapTimezones> + </windowsZones> + </supplementalData> +""" + zones = self.supplement('windowsZones.xml') + enum = self.__enumMap('country') + badZones, unLands, defaults, windows = set(), set(), {}, {} + + for name, attrs in zones.find('windowsZones/mapTimezones'): + if name != 'mapZone': + continue + + wid, code = attrs['other'], attrs['territory'] + data = dict(windowsId = wid, + countryCode = code, + ianaList = attrs['type']) + + try: + key = lookup[wid] + except KeyError: + badZones.add(wid) + key = 0 + data['windowsKey'] = key + + if code == u'001': + defaults[key] = data['ianaList'] + else: + try: + cid, name = enum[code] + except KeyError: + unLands.append(code) + continue + data.update(countryId = cid, country = name) + windows[key, cid] = data + + if unLands: + raise Error('Unknown country codes, please add to enumdata.py: ' + + ', '.join(sorted(unLands))) + + if badZones: + raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: ' + + ', '.join(sorted(badZones))) + + return self.cldrVersion, defaults, windows + + @property + def cldrVersion(self): + # Evaluate so as to ensure __cldrVersion is set: + self.__unDistinguishedAttributes + return self.__cldrVersion + + # Implementation details + def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join): + try: + doc = cache[path] + except KeyError: + cache[path] = doc = read(joinPath(self.root, *path)).documentElement + return doc + + def __open(self, path, joinPath=os.path.join): + return open(joinPath(self.root, *path)) + + @property + def __rootLocale(self, cache = []): + if not cache: + cache.append(self.xml('common', 'main', 'root.xml')) + return cache[0] + + @property + def __supplementalData(self, cache = []): + if not cache: + cache.append(self.supplement('supplementalData.xml')) + return cache[0] + + @property + def __numberSystems(self, cache = {}, joinPath=os.path.join): + if not cache: + for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'): + cache[attrs['id']] = attrs + assert cache + return cache + + @property + def __weekData(self, cache = {}): + if not cache: + firstDay, weStart, weEnd = self.__getWeekData() + # Massage those into an easily-consulted form: + # World defaults given for code '001': + mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001'] + lands = set(firstDay) | set(weStart) | set(weEnd) + cache.update((land, + (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun))) + for land in lands) + assert cache + return cache + + def __getWeekData(self): + """Scan for data on the weekly cycle. + + Yields three mappings from locales to en's short names for + week-days; if a locale isn't a key of a given mapping, it + should use the '001' (world) locale's value. The first mapping + gives the day on which the week starts, the second gives the + day on which the week-end starts, the third gives the last day + of the week-end.""" + source = self.__supplementalData + for key in ('firstDay', 'weekendStart', 'weekendEnd'): + result = {} + for ignore, attrs in source.find('weekData/' + key): + assert ignore == key + day = attrs['day'] + assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day + if 'alt' in attrs: + continue + for loc in attrs.get('territories', '').split(): + result[loc] = day + yield result + + @property + def __currencyData(self, cache = {}): + if not cache: + source = self.__supplementalData + for elt in source.findNodes('currencyData/region'): + iso, digits, rounding = '', 2, 1 + try: + country = elt.dom.attributes['iso3166'].nodeValue + except KeyError: + continue + for child in elt.findAllChildren('currency'): + try: + if child.dom.attributes['tender'].nodeValue == 'false': + continue + except KeyError: + pass + try: + child.dom.attributes['to'] # Is set if this element has gone out of date. + except KeyError: + iso = child.dom.attributes['iso4217'].nodeValue + break + if iso: + for tag, data in source.find( + 'currencyData/fractions/info[iso4217={}]'.format(iso)): + digits = data['digits'] + rounding = data['rounding'] + cache[country] = iso, digits, rounding + assert cache + + return cache + + @property + def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join): + """Mapping from tag names to lists of attributes. + + LDML defines some attributes as 'distinguishing': if a node + has distinguishing attributes that weren't specified in an + XPath, a search on that XPath should exclude the node's + children. + + This property is a mapping from tag names to tuples of + attribute names that *aren't* distinguishing for that tag. + Its value is cached (so its costly computation isonly done + once) and there's a side-effect of populating its cache: it + sets self.__cldrVersion to the value found in ldml.dtd, during + parsing.""" + if not cache: + cache.update(self.__scanLdmlDtd()) + assert cache + + return cache + + def __scanLdmlDtd(self, joinPath = os.path.join): + """Scan the LDML DTD, record CLDR version + + Yields (tag, attrs) pairs: on elements with a given tag, + attributes named in its attrs (a tuple) may be ignored in an + XPath search; other attributes are distinguished attributes, + in the terminology of LDML's locale-inheritance rules. + + Sets self.__cldrVersion as a side-effect, since this + information is found in the same file.""" + with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd: + tag, ignored, last = None, None, None + + for line in dtd: + if line.startswith('<!ELEMENT '): + if ignored: + assert tag + yield tag, tuple(ignored) + tag, ignored, last = line.split()[1], [], None + continue + + if line.startswith('<!ATTLIST '): + assert tag is not None + parts = line.split() + assert parts[1] == tag + last = parts[2] + if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']: + # parts[5] is the version, in quotes, although the final > might be stuck on its end: + self.__cldrVersion = parts[5].split('"')[1] + continue + + # <!ELEMENT...>s can also be @METADATA, but not @VALUE: + if '<!--@VALUE-->' in line or (last and '<!--@METADATA-->' in line): + assert last is not None + assert ignored is not None + assert tag is not None + ignored.append(last) + last = None # No attribute is both value and metadata + + if tag and ignored: + yield tag, tuple(ignored) + + def __enumMap(self, key, cache = {}): + if not cache: + cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')} + # They're not actually lists: mappings from numeric value + # to pairs of full name and short code. What we want, in + # each case, is a mapping from code to the other two. + from enumdata import language_list, script_list, country_list + for form, book, empty in (('language', language_list, 'AnyLanguage'), + ('script', script_list, 'AnyScript'), + ('country', country_list, 'AnyCountry')): + cache[form] = dict((pair[1], (num, pair[0])) + for num, pair in book.items() if pair[0] != 'C') + # (Have to filter out the C locale, as we give it the + # same (all space) code as AnyLanguage, whose code + # should probably be 'und' instead.) + + # Map empty to zero and the any value: + cache[form][''] = (0, empty) + # and map language code 'und' also to (0, any): + cache['language']['und'] = (0, 'AnyLanguage') + + return cache[key] + + def __codeMap(self, key, cache = {}, + # Maps our name for it to CLDR's name: + naming = {'language': 'languages', 'script': 'scripts', + 'country': 'territories', 'variant': 'variants'}): + if not cache: + root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames') + for dst, src in naming.items(): + cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src))) + assert cache + + return cache[key] + + def __codeMapScan(self, node): + """Get mapping from codes to element values. + + Passed in node is a <languages>, <scripts>, <territories> or + <variants> node, each child of which is a <language>, + <script>, <territory> or <variant> node as appropriate, whose + type is a code (of the appropriate flavour) and content is its + full name. In some cases, two child nodes have the same type; + in these cases, one always has an alt attribute and we should + prefer the other. Yields all such type, content pairs found + in node's children (skipping any with an alt attribute, if + their type has been seen previously).""" + seen = set() + for elt in node.dom.childNodes: + try: + key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText + except (KeyError, ValueError, TypeError): + pass + else: + if key not in seen or not elt.attributes.has_key('alt'): + yield key, value + seen.add(key) + + # CLDR uses inheritance between locales to save repetition: + def __parentLocale(self, name, cache = {}): + # see http://www.unicode.org/reports/tr35/#Parent_Locales + if not cache: + for tag, attrs in self.__supplementalData.find('parentLocales'): + parent = attrs.get('parent', '') + for child in attrs['locales'].split(): + cache[child] = parent + assert cache + + return cache[name] + + def __localeAsDoc(self, name, aliasFor = None, + joinPath = os.path.join, exists = os.path.isfile): + path = ('common', 'main', name + '.xml') + if exists(joinPath(self.root, *path)): + elt = self.__xml(path) + for child in Node(elt).findAllChildren('alias'): + try: + alias = child.dom.attributes['source'].nodeValue + except (KeyError, AttributeError): + pass + else: + return self.__localeAsDoc(alias, aliasFor or name) + # No alias child with a source: + return elt + + if aliasFor: + raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias' + .format(aliasFor, name)) + + def __scanLocaleRoots(self, name): + while name and name != 'root': + doc = self.__localeAsDoc(name) + if doc is not None: + yield Node(doc, self.__unDistinguishedAttributes) + + try: + name = self.__parentLocale(name) + except KeyError: + try: + name, tail = name.rsplit('_', 1) + except ValueError: # No tail to discard: we're done + break + + class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok. + def __localeRoots(self, name, cache = CacheDict()): + try: + chain = cache[name] + except KeyError: + cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name)) + return chain + +# Unpolute the namespace: we don't need to export these. +del minidom, CacheDict, os |