authorQt Forward Merge Bot <qt_forward_merge_bot@qt-project.org>2020-04-07 01:00:12 +0200
committerFabian Kosmale <fabian.kosmale@qt.io>2020-04-08 22:04:23 +0200
commitc937ed8af4f3dfef3fd8f8c2a9815376790dd5bf (patch)
tree5175aff87e160ae8f32dadc60d3cfd38b73d4fb1 /util
parente0346df1b21cb30b54ae8d4918addc9925fa8479 (diff)
parent8823bb8d306d78dd6a2e121a708dc607beff58c8 (diff)
Merge "Merge remote-tracking branch 'origin/5.15' into dev"
Diffstat (limited to 'util')
-rw-r--r--   util/locale_database/cldr.py               718
-rwxr-xr-x   util/locale_database/cldr2qlocalexml.py    705
-rwxr-xr-x   util/locale_database/cldr2qtimezone.py     369
-rw-r--r--   util/locale_database/ldml.py               589
-rw-r--r--   util/locale_database/localetools.py        164
-rw-r--r--   util/locale_database/qlocalexml.py         368
-rwxr-xr-x   util/locale_database/qlocalexml2cpp.py    1252
-rw-r--r--   util/locale_database/xpathlite.py          288
8 files changed, 2511 insertions, 1942 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py
new file mode 100644
index 0000000000..4b54f50080
--- /dev/null
+++ b/util/locale_database/cldr.py
@@ -0,0 +1,718 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Digesting the CLDR's data.
+
+Provides two classes:
+ CldrReader -- driver for reading CLDR data
+ CldrAccess -- used by the reader to access the tree of data files
+
+The former should normally be all you need to access.
+See individual classes for further detail.
+"""
+
+from xml.dom import minidom
+from weakref import WeakValueDictionary as CacheDict
+import os
+
+from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
+from qlocalexml import Locale
+
+class CldrReader (object):
+ def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None):
+ """Set up a reader object for reading CLDR data.
+
+ Single parameter, root, is the file-system path to the root of
+ the unpacked CLDR archive; its common/ sub-directory should
+ contain dtd/, main/ and supplemental/ sub-directories.
+
+ Optional second argument, grumble, is a callable that logs
+ warnings and complaints, e.g. sys.stderr.write would be a
+ suitable callable. The default is a no-op that ignores its
+ single argument. Optional third argument is similar, used for
+ less interesting output; pass sys.stderr.write for it for
+ verbose output."""
+ self.root = CldrAccess(root)
+ self.whitter, self.grumble = whitter, grumble
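+        # A minimal usage sketch; the path and the choice of sys.stderr.write
+        # here are illustrative, not required:
+        #     reader = CldrReader('/path/to/cldr', grumble = sys.stderr.write)
+        #     for have, give in reader.likelySubTags():
+        #         ...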
+
+ def likelySubTags(self):
+ """Generator for likely subtag information.
+
+ Yields pairs (have, give) of 4-tuples; if what you have
+ matches the left member, giving the right member is probably
+ sensible. Each 4-tuple's entries are the full names of a
+ language, a script, a country (strictly territory) and a
+ variant (currently ignored)."""
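+        # For example, CLDR's entry mapping "aa" to "aa_Latn_ET" should come
+        # out roughly as (names taken from enumdata.py, shown for illustration):
+        #     have = ('Afar', 'AnyScript', 'AnyCountry', ...)
+        #     give = ('Afar', 'Latin', 'Ethiopia', ...)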
+ skips = []
+ for got, use in self.root.likelySubTags():
+ try:
+ have = self.__parseTags(got)
+ give = self.__parseTags(use)
+ except Error as e:
+ if ((use.startswith(got) or got.startswith('und_'))
+ and e.message.startswith('Unknown ') and ' code ' in e.message):
+ skips.append(use)
+ else:
+ self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message))
+ continue
+ if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]):
+ continue
+
+ give = (give[0],
+ # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
+ have[1] if give[1] == 'AnyScript' else give[1],
+ have[2] if give[2] == 'AnyCountry' else give[2],
+ give[3]) # AnyVariant similarly ?
+
+ yield have, give
+
+ if skips:
+ # TODO: look at LDML's reserved locale tag names; they
+ # show up a lot in this, and may be grounds for filtering
+ # more out.
+ pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips)
+
+ def readLocales(self, calendars = ('gregorian',)):
+ locales = tuple(self.__allLocales(calendars))
+ return dict(((k.language_id, k.script_id, k.country_id, k.variant_code),
+ k) for k in locales)
+
+ def __allLocales(self, calendars):
+ def skip(locale, reason):
+ return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason)
+
+ for locale in self.root.defaultContentLocales:
+ try:
+ language, script, country, variant = self.__splitLocale(locale)
+ except ValueError:
+ self.whitter(skip(locale, 'only language tag'))
+ continue
+
+ if not (script or country):
+ self.grumble(skip(locale, 'second tag is neither script nor territory'))
+ continue
+
+ if not (language and country):
+ continue
+
+ try:
+ yield self.__getLocaleData(self.root.locale(locale), calendars,
+ language, script, country, variant)
+ except Error as e:
+ self.grumble(skip(locale, e.message))
+
+ for locale in self.root.fileLocales:
+ try:
+ chain = self.root.locale(locale)
+ language, script, country, variant = chain.tagCodes()
+ assert language
+ # TODO: this skip should probably be based on likely
+ # sub-tags, instead of empty country: if locale has a
+ # likely-subtag expansion, that's what QLocale uses,
+ # and we'll be saving its data for the expanded locale
+ # anyway, so don't need to record it for itself.
+ # See also QLocaleXmlReader.loadLocaleMap's grumble.
+ if not country:
+ continue
+ yield self.__getLocaleData(chain, calendars, language, script, country, variant)
+ except Error as e:
+ self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message))
+
+ import textwrap
+ @staticmethod
+ def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap):
+ writer('\n'.join(wrap(prefix + ', '.join(tokens),
+ subsequent_indent=' ', width=80)) + '\n')
+ del textwrap
+
+ def __parseTags(self, locale):
+ tags = self.__splitLocale(locale)
+ language = tags.next()
+ script = country = variant = ''
+ try:
+ script, country, variant = tags
+ except ValueError:
+ pass
+ return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant))
+
+ def __splitLocale(self, name):
+ """Generate (language, script, territory, variant) from a locale name
+
+ Ignores any trailing fields (with a warning), leaves script (a
+ capitalised four-letter token), territory (either a number or
+ an all-uppercase token) or variant (upper case and digits)
+ empty if unspecified. Only generates one entry if name is a
+ single tag (i.e. contains no underscores). Always yields 1 or
+ 4 values, never 2 or 3."""
+ tags = iter(name.split('_'))
+ yield tags.next() # Language
+ tag = tags.next() # may raise StopIteration
+
+ # Script is always four letters, always capitalised:
+ if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+ yield tag
+ try:
+ tag = tags.next()
+ except StopIteration:
+ tag = ''
+ else:
+ yield ''
+
+ # Territory is upper-case or numeric:
+ if tag and tag.isupper() or tag.isdigit():
+ yield tag
+ try:
+ tag = tags.next()
+ except StopIteration:
+ tag = ''
+ else:
+ yield ''
+
+ # Variant can be any mixture of upper-case and digits.
+ if tag and all(c.isupper() or c.isdigit() for c in tag):
+ yield tag
+ tag = ''
+ else:
+ yield ''
+
+ # If nothing is left, StopIteration will avoid the warning:
+ if not tag:
+ tag = tags.next()
+        self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join((tag,) + tuple(tags)), name))
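+        # Illustration (the locale names here are merely examples):
+        #     tuple(self.__splitLocale('sr_Latn_RS')) -> ('sr', 'Latn', 'RS', '')
+        #     tuple(self.__splitLocale('en')) -> ('en',)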
+
+ def __getLocaleData(self, scan, calendars, language, script, country, variant):
+ ids, names = zip(*self.root.codesToIdName(language, script, country, variant))
+ assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant)
+ locale = Locale(
+ language = names[0], language_code = language, language_id = ids[0],
+ script = names[1], script_code = script, script_id = ids[1],
+ country = names[2], country_code = country, country_id = ids[2],
+ variant_code = variant)
+
+ firstDay, weStart, weEnd = self.root.weekData(country)
+ assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
+ for day in (firstDay, weStart, weEnd))
+
+ locale.update(firstDayOfWeek = firstDay,
+ weekendStart = weStart,
+ weekendEnd = weEnd)
+
+ iso, digits, rounding = self.root.currencyData(country)
+ locale.update(currencyIsoCode = iso,
+ currencyDigits = int(digits),
+ currencyRounding = int(rounding))
+
+ locale.update(scan.currencyData(iso))
+ locale.update(scan.numericData(self.root.numberSystem, self.whitter))
+ locale.update(scan.textPatternData())
+ locale.update(scan.endonyms(language, script, country, variant))
+ locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ...
+ locale.update(scan.calendarNames(calendars)) # Names of days and months
+
+ return locale
+
+# Note: various caches assume this class is a singleton, so the
+# "default" value for a parameter no caller should pass can serve as
+# the cache. If a process were to instantiate this class with distinct
+# roots, each cache would be filled by the first to need it !
+class CldrAccess (object):
+ def __init__(self, root):
+ """Set up a master object for accessing CLDR data.
+
+ Single parameter, root, is the file-system path to the root of
+ the unpacked CLDR archive; its common/ sub-directory should
+ contain dtd/, main/ and supplemental/ sub-directories."""
+ self.root = root
+
+ def xml(self, *path):
+ """Load a single XML file and return its root element as an XmlScanner.
+
+ The path is interpreted relative to self.root"""
+ return XmlScanner(Node(self.__xml(path)))
+
+ def supplement(self, name):
+ """Loads supplemental data as a Supplement object.
+
+ The name should be that of a file in common/supplemental/, without path.
+ """
+ return Supplement(Node(self.__xml(('common', 'supplemental', name))))
+
+ def locale(self, name):
+ """Loads all data for a locale as a LocaleScanner object.
+
+ The name should be a locale name; adding suffix '.xml' to it
+ should usually yield a file in common/main/. The returned
+ LocaleScanner object packages this file along with all those
+ from which it inherits; its methods know how to handle that
+ inheritance, where relevant."""
+ return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
+
+ @property
+ def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir,
+ splitExtension = os.path.splitext):
+ """Generator for locale IDs seen in file-names.
+
+ All *.xml other than root.xml in common/main/ are assumed to
+ identify locales."""
+ for name in listDirectory(joinPath(self.root, 'common', 'main')):
+ stem, ext = splitExtension(name)
+ if ext == '.xml' and stem != 'root':
+ yield stem
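+        # So, for instance, a file common/main/en_GB.xml (if present in the
+        # CLDR release in use) contributes the locale ID 'en_GB'.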
+
+ @property
+ def defaultContentLocales(self):
+ """Generator for the default content locales."""
+ for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'):
+ try:
+ locales = attrs['locales']
+ except KeyError:
+ pass
+ else:
+ for locale in locales.split():
+ yield locale
+
+ def likelySubTags(self):
+ for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'):
+ yield attrs['from'], attrs['to']
+
+ def numberSystem(self, system):
+ """Get a description of a numbering system.
+
+ Returns a mapping, with keys u'digits', u'type' and u'id'; the
+        value for this last is system. Raises ldml.Error for an unknown
+        number system or on failure to load data."""
+ try:
+ return self.__numberSystems[system]
+ except KeyError:
+ raise Error('Unsupported number system: {}'.format(system))
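+        # For instance, numberSystem('latn') should describe the familiar
+        # Latin digits, roughly:
+        #     {u'digits': u'0123456789', u'type': u'numeric', u'id': u'latn'}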
+
+ def weekData(self, country):
+ """Data on the weekly cycle.
+
+ Returns a triple (W, S, E) of en's short names for week-days;
+ W is the first day of the week, S the start of the week-end
+ and E the end of the week-end. Where data for a country is
+ unavailable, the data for CLDR's territory 001 (The World) is
+ used."""
+ try:
+ return self.__weekData[country]
+ except KeyError:
+ return self.__weekData['001']
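+        # So, for a country code CLDR doesn't list, e.g. hypothetically:
+        #     self.weekData('ZZ') -> ('mon', 'sat', 'sun')
+        # i.e. whatever the world (001) entry provides.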
+
+ def currencyData(self, country):
+ """Returns currency data for the given country code.
+
+ Return value is a tuple (ISO4217 code, digit count, rounding
+ mode). If CLDR provides no data for this country, ('', 2, 1)
+ is the default result.
+ """
+ try:
+ return self.__currencyData[country]
+ except KeyError:
+ return '', 2, 1
+
+ def codesToIdName(self, language, script, country, variant = ''):
+ """Maps each code to the appropriate ID and name.
+
+ Returns a 4-tuple of (ID, name) pairs corresponding to the
+ language, script, country and variant given. Raises a
+ suitable error if any of them is unknown, indicating all that
+ are unknown plus suitable names for any that could sensibly be
+ added to enumdata.py to make them known.
+
+ Until we implement variant support (QTBUG-81051), the fourth
+ member of the returned tuple is always 0 paired with a string
+ that should not be used."""
+ enum = self.__enumMap
+ try:
+ return (enum('language')[language],
+ enum('script')[script],
+ enum('country')[country],
+ enum('variant')[variant])
+ except KeyError:
+ pass
+
+ parts, values = [], [language, script, country, variant]
+ for index, key in enumerate(('language', 'script', 'country', 'variant')):
+ naming, enums = self.__codeMap(key), enum(key)
+ value = values[index]
+ if value not in enums:
+ text = '{} code {}'.format(key, value)
+ name = naming.get(value)
+ if name and value != 'POSIX':
+ text += u' (could add {})'.format(name)
+ parts.append(text)
+ if len(parts) > 1:
+ parts[-1] = 'and ' + parts[-1]
+ assert parts
+ raise Error('Unknown ' + ', '.join(parts),
+ language, script, country, variant)
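+        # Illustration (the numeric IDs come from enumdata.py, so only
+        # placeholders are shown here):
+        #     self.codesToIdName('en', '', 'GB')
+        #     -> ((<id>, 'English'), (0, 'AnyScript'), (<id>, <name>), (0, ...))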
+
+ def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py
+ """Digest CLDR's MS-Win time-zone name mapping.
+
+        MS-Win has its own eccentric names for time-zones. CLDR
+ helpfully provides a translation to more orthodox names.
+
+        Single argument, lookup, is a mapping from known MS-Win names
+ for locales to a unique integer index (starting at 1).
+
+ The XML structure we read has the form:
+
+ <supplementalData>
+ <windowsZones>
+ <mapTimezones otherVersion="..." typeVersion="...">
+ <!-- (UTC-08:00) Pacific Time (US & Canada) -->
+ <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
+ <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
+ <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
+ <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
+ </mapTimezones>
+ </windowsZones>
+ </supplementalData>
+"""
+ zones = self.supplement('windowsZones.xml')
+ enum = self.__enumMap('country')
+ badZones, unLands, defaults, windows = set(), set(), {}, {}
+
+ for name, attrs in zones.find('windowsZones/mapTimezones'):
+ if name != 'mapZone':
+ continue
+
+ wid, code = attrs['other'], attrs['territory']
+ data = dict(windowsId = wid,
+ countryCode = code,
+ ianaList = attrs['type'])
+
+ try:
+ key = lookup[wid]
+ except KeyError:
+ badZones.add(wid)
+ key = 0
+ data['windowsKey'] = key
+
+ if code == u'001':
+ defaults[key] = data['ianaList']
+ else:
+ try:
+ cid, name = enum[code]
+ except KeyError:
+                    unLands.add(code)
+ continue
+ data.update(countryId = cid, country = name)
+ windows[key, cid] = data
+
+ if unLands:
+ raise Error('Unknown country codes, please add to enumdata.py: '
+ + ', '.join(sorted(unLands)))
+
+ if badZones:
+ raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: '
+ + ', '.join(sorted(badZones)))
+
+ return self.cldrVersion, defaults, windows
+
+ @property
+ def cldrVersion(self):
+ # Evaluate so as to ensure __cldrVersion is set:
+ self.__unDistinguishedAttributes
+ return self.__cldrVersion
+
+ # Implementation details
+ def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join):
+ try:
+ doc = cache[path]
+ except KeyError:
+ cache[path] = doc = read(joinPath(self.root, *path)).documentElement
+ return doc
+
+ def __open(self, path, joinPath=os.path.join):
+ return open(joinPath(self.root, *path))
+
+ @property
+ def __rootLocale(self, cache = []):
+ if not cache:
+ cache.append(self.xml('common', 'main', 'root.xml'))
+ return cache[0]
+
+ @property
+ def __supplementalData(self, cache = []):
+ if not cache:
+ cache.append(self.supplement('supplementalData.xml'))
+ return cache[0]
+
+ @property
+ def __numberSystems(self, cache = {}, joinPath=os.path.join):
+ if not cache:
+ for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'):
+ cache[attrs['id']] = attrs
+ assert cache
+ return cache
+
+ @property
+ def __weekData(self, cache = {}):
+ if not cache:
+ firstDay, weStart, weEnd = self.__getWeekData()
+ # Massage those into an easily-consulted form:
+ # World defaults given for code '001':
+ mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001']
+ lands = set(firstDay) | set(weStart) | set(weEnd)
+ cache.update((land,
+ (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun)))
+ for land in lands)
+ assert cache
+ return cache
+
+ def __getWeekData(self):
+ """Scan for data on the weekly cycle.
+
+ Yields three mappings from locales to en's short names for
+ week-days; if a locale isn't a key of a given mapping, it
+ should use the '001' (world) locale's value. The first mapping
+ gives the day on which the week starts, the second gives the
+ day on which the week-end starts, the third gives the last day
+ of the week-end."""
+ source = self.__supplementalData
+ for key in ('firstDay', 'weekendStart', 'weekendEnd'):
+ result = {}
+ for ignore, attrs in source.find('weekData/' + key):
+ assert ignore == key
+ day = attrs['day']
+ assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day
+ if 'alt' in attrs:
+ continue
+ for loc in attrs.get('territories', '').split():
+ result[loc] = day
+ yield result
+
+ @property
+ def __currencyData(self, cache = {}):
+ if not cache:
+ source = self.__supplementalData
+ for elt in source.findNodes('currencyData/region'):
+ iso, digits, rounding = '', 2, 1
+ try:
+ country = elt.dom.attributes['iso3166'].nodeValue
+ except KeyError:
+ continue
+ for child in elt.findAllChildren('currency'):
+ try:
+ if child.dom.attributes['tender'].nodeValue == 'false':
+ continue
+ except KeyError:
+ pass
+ try:
+ child.dom.attributes['to'] # Is set if this element has gone out of date.
+ except KeyError:
+ iso = child.dom.attributes['iso4217'].nodeValue
+ break
+ if iso:
+ for tag, data in source.find(
+ 'currencyData/fractions/info[iso4217={}]'.format(iso)):
+ digits = data['digits']
+ rounding = data['rounding']
+ cache[country] = iso, digits, rounding
+ assert cache
+
+ return cache
+
+ @property
+ def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join):
+ """Mapping from tag names to lists of attributes.
+
+ LDML defines some attributes as 'distinguishing': if a node
+ has distinguishing attributes that weren't specified in an
+ XPath, a search on that XPath should exclude the node's
+ children.
+
+ This property is a mapping from tag names to tuples of
+ attribute names that *aren't* distinguishing for that tag.
+        Its value is cached (so its costly computation is only done
+ once) and there's a side-effect of populating its cache: it
+ sets self.__cldrVersion to the value found in ldml.dtd, during
+ parsing."""
+ if not cache:
+ cache.update(self.__scanLdmlDtd())
+ assert cache
+
+ return cache
+
+ def __scanLdmlDtd(self, joinPath = os.path.join):
+ """Scan the LDML DTD, record CLDR version
+
+ Yields (tag, attrs) pairs: on elements with a given tag,
+ attributes named in its attrs (a tuple) may be ignored in an
+ XPath search; other attributes are distinguished attributes,
+ in the terminology of LDML's locale-inheritance rules.
+
+ Sets self.__cldrVersion as a side-effect, since this
+ information is found in the same file."""
+ with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd:
+ tag, ignored, last = None, None, None
+
+ for line in dtd:
+ if line.startswith('<!ELEMENT '):
+ if ignored:
+ assert tag
+ yield tag, tuple(ignored)
+ tag, ignored, last = line.split()[1], [], None
+ continue
+
+ if line.startswith('<!ATTLIST '):
+ assert tag is not None
+ parts = line.split()
+ assert parts[1] == tag
+ last = parts[2]
+ if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']:
+ # parts[5] is the version, in quotes, although the final > might be stuck on its end:
+ self.__cldrVersion = parts[5].split('"')[1]
+ continue
+
+ # <!ELEMENT...>s can also be @METADATA, but not @VALUE:
+ if '<!--@VALUE-->' in line or (last and '<!--@METADATA-->' in line):
+ assert last is not None
+ assert ignored is not None
+ assert tag is not None
+ ignored.append(last)
+ last = None # No attribute is both value and metadata
+
+ if tag and ignored:
+ yield tag, tuple(ignored)
+
+ def __enumMap(self, key, cache = {}):
+ if not cache:
+ cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')}
+ # They're not actually lists: mappings from numeric value
+ # to pairs of full name and short code. What we want, in
+ # each case, is a mapping from code to the other two.
+ from enumdata import language_list, script_list, country_list
+ for form, book, empty in (('language', language_list, 'AnyLanguage'),
+ ('script', script_list, 'AnyScript'),
+ ('country', country_list, 'AnyCountry')):
+ cache[form] = dict((pair[1], (num, pair[0]))
+ for num, pair in book.items() if pair[0] != 'C')
+ # (Have to filter out the C locale, as we give it the
+ # same (all space) code as AnyLanguage, whose code
+ # should probably be 'und' instead.)
+
+ # Map empty to zero and the any value:
+ cache[form][''] = (0, empty)
+ # and map language code 'und' also to (0, any):
+ cache['language']['und'] = (0, 'AnyLanguage')
+
+ return cache[key]
+
+ def __codeMap(self, key, cache = {},
+ # Maps our name for it to CLDR's name:
+ naming = {'language': 'languages', 'script': 'scripts',
+ 'country': 'territories', 'variant': 'variants'}):
+ if not cache:
+ root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames')
+ for dst, src in naming.items():
+ cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src)))
+ assert cache
+
+ return cache[key]
+
+ def __codeMapScan(self, node):
+ """Get mapping from codes to element values.
+
+ Passed in node is a <languages>, <scripts>, <territories> or
+ <variants> node, each child of which is a <language>,
+ <script>, <territory> or <variant> node as appropriate, whose
+ type is a code (of the appropriate flavour) and content is its
+ full name. In some cases, two child nodes have the same type;
+ in these cases, one always has an alt attribute and we should
+ prefer the other. Yields all such type, content pairs found
+ in node's children (skipping any with an alt attribute, if
+ their type has been seen previously)."""
+ seen = set()
+ for elt in node.dom.childNodes:
+ try:
+ key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText
+ except (KeyError, ValueError, TypeError):
+ pass
+ else:
+ if key not in seen or not elt.attributes.has_key('alt'):
+ yield key, value
+ seen.add(key)
+
+ # CLDR uses inheritance between locales to save repetition:
+ def __parentLocale(self, name, cache = {}):
+ # see http://www.unicode.org/reports/tr35/#Parent_Locales
+ if not cache:
+ for tag, attrs in self.__supplementalData.find('parentLocales'):
+ parent = attrs.get('parent', '')
+ for child in attrs['locales'].split():
+ cache[child] = parent
+ assert cache
+
+ return cache[name]
+
+ def __localeAsDoc(self, name, aliasFor = None,
+ joinPath = os.path.join, exists = os.path.isfile):
+ path = ('common', 'main', name + '.xml')
+ if exists(joinPath(self.root, *path)):
+ elt = self.__xml(path)
+ for child in Node(elt).findAllChildren('alias'):
+ try:
+ alias = child.dom.attributes['source'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ return self.__localeAsDoc(alias, aliasFor or name)
+ # No alias child with a source:
+ return elt
+
+ if aliasFor:
+ raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias'
+ .format(aliasFor, name))
+
+ def __scanLocaleRoots(self, name):
+ while name and name != 'root':
+ doc = self.__localeAsDoc(name)
+ if doc is not None:
+ yield Node(doc, self.__unDistinguishedAttributes)
+
+ try:
+ name = self.__parentLocale(name)
+ except KeyError:
+ try:
+ name, tail = name.rsplit('_', 1)
+ except ValueError: # No tail to discard: we're done
+ break
+
+ class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok.
+ def __localeRoots(self, name, cache = CacheDict()):
+ try:
+ chain = cache[name]
+ except KeyError:
+ cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name))
+ return chain
+
+# Unpollute the namespace: we don't need to export these.
+del minidom, CacheDict, os
diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py
index 7f98e29d47..c05cabf520 100755
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@@ -31,15 +31,17 @@
The CLDR data can be downloaded from CLDR_, which has a sub-directory
for each version; you need the ``core.zip`` file for your version of
-choice (typically the latest). This script has had updates to cope up
-to v35; for later versions, we may need adaptations. Unpack the
+choice (typically the latest). This script has been updated to cope
+with CLDR up to v35; for later versions, we may need adaptations. Unpack the
downloaded ``core.zip`` and check it has a common/main/ sub-directory:
-pass the path of that sub-directory to this script as its single
-command-line argument. Save its standard output (but not error) to a
-file for later processing by ``./qlocalexml2cpp.py``
+pass the path of the root of that download to this script as its first
+command-line argument. Pass the name of the file in which to write
+output as the second argument; either omit it or use '-' to select the
+standard output. This file is the input needed by
+``./qlocalexml2cpp.py``
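+
+For example (the paths here are purely illustrative)::
+
+    ./cldr2qlocalexml.py ~/cldr/core qlocale.xml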
When you update the CLDR data, be sure to also update
-src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
+src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
this script's output for unknown language, country or script messages;
if any can be resolved, use their entry in common/main/en.xml to
append new entries to enumdata.py's lists and update documentation in
@@ -54,646 +56,67 @@ time zone names; see cldr2qtimezone.py for details.
import os
import sys
-import re
-import textwrap
-import enumdata
-import xpathlite
-from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
-from dateconverter import convert_date
-from qlocalexml import Locale
-
-# TODO: make calendars a command-line option
-calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
-findEntryInFile = xpathlite._findEntryInFile
-def wrappedwarn(prefix, tokens):
- return sys.stderr.write(
- '\n'.join(textwrap.wrap(prefix + ', '.join(tokens),
- subsequent_indent=' ', width=80)) + '\n')
-
-def parse_number_format(patterns, data):
- # this is a very limited parsing of the number format for currency only.
- def skip_repeating_pattern(x):
- p = x.replace('0', '#').replace(',', '').replace('.', '')
- seen = False
- result = ''
- for c in p:
- if c == '#':
- if seen:
- continue
- seen = True
- else:
- seen = False
- result = result + c
- return result
- patterns = patterns.split(';')
- result = []
- for pattern in patterns:
- pattern = skip_repeating_pattern(pattern)
- pattern = pattern.replace('#', "%1")
- # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
- # there can be doubled or trippled currency sign, however none of the
- # locales use that.
- pattern = pattern.replace(u'\xa4', "%2")
- pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'")
- pattern = pattern.replace('-', data['minus'])
- pattern = pattern.replace('+', data['plus'])
- result.append(pattern)
- return result
-
-def raiseUnknownCode(code, form, cache={}):
- """Check whether an unknown code could be supported.
-
- We declare a language, script or country code unknown if it's not
- known to enumdata.py; however, if it's present in main/en.xml's
- mapping of codes to names, we have the option of adding support.
- This caches the necessary look-up (so we only read main/en.xml
- once) and returns the name we should use if we do add support.
-
- First parameter, code, is the unknown code. Second parameter,
- form, is one of 'language', 'script' or 'country' to select the
- type of code to look up. Do not pass further parameters (the next
- will deprive you of the cache).
-
- Raises xpathlite.Error with a suitable message, that includes the
- unknown code's full name if found.
-
- Relies on global cldr_dir being set before it's called; see tail
- of this file.
- """
- if not cache:
- cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml')))
- name = cache[form].get(code)
- msg = 'unknown %s code "%s"' % (form, code)
- if name:
- msg += ' - could use "%s"' % name
- raise xpathlite.Error(msg)
-
-def parse_list_pattern_part_format(pattern):
- # This is a very limited parsing of the format for list pattern part only.
- return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")
-
-def unit_quantifiers(find, path, stem, suffix, known,
- # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
- # 1000^7 < zebi = 2^{70}, the next quantifiers up:
- si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
- """Work out the unit quantifiers.
-
- Unfortunately, the CLDR data only go up to terabytes and we want
- all the way to exabytes; but we can recognize the SI quantifiers
- as prefixes, strip and identify the tail as the localized
- translation for 'B' (e.g. French has 'octet' for 'byte' and uses
- ko, Mo, Go, To from which we can extrapolate Po, Eo).
-
- Should be called first for the SI quantifiers, with suffix = 'B',
- then for the IEC ones, with suffix = 'iB'; the list known
- (initially empty before first call) is used to let the second call
- know what the first learned about the localized unit.
- """
- if suffix == 'B': # first call, known = []
- tail = suffix
- for q in si_quantifiers:
- it = find(path, stem % q)
- # kB for kilobyte, in contrast with KiB for IEC:
- q = q[0] if q == 'kilo' else q[0].upper()
- if not it:
- it = q + tail
- elif it.startswith(q):
- rest = it[1:]
- tail = rest if all(rest == k for k in known) else suffix
- known.append(rest)
- yield it
- else: # second call, re-using first's known
- assert suffix == 'iB'
- if known:
- byte = known.pop()
- if all(byte == k for k in known):
- suffix = 'i' + byte
- for q in si_quantifiers:
- yield find(path, stem % q[:2],
- # Those don't (yet, v31) exist in CLDR, so we always fall back to:
- q[0].upper() + suffix)
-
-def generateLocaleInfo(path):
- if not path.endswith(".xml"):
- return {}
-
- # skip legacy/compatibility ones
- alias = findAlias(path)
- if alias:
- raise xpathlite.Error('alias to "%s"' % alias)
-
- def code(tag):
- return findEntryInFile(path, 'identity/' + tag, attribute="type")[0]
-
- return _generateLocaleInfo(path, code('language'), code('script'),
- code('territory'), code('variant'))
-
-def getNumberSystems(cache={}):
- """Cached look-up of number system information.
-
- Pass no arguments. Returns a mapping from number system names to,
- for each system, a mapping with keys u'digits', u'type' and
- u'id'\n"""
- if not cache:
- for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'numberingSystems.xml'),
- 'numberingSystems'):
- # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]]
- entry = dict(ns[1])
- cache[entry[u'id']] = entry
- return cache
-
-def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
- if not path.endswith(".xml"):
- return {}
-
- if language_code == 'root':
- # just skip it
- return {}
-
- # we do not support variants
- # ### actually there is only one locale with variant: en_US_POSIX
- # does anybody care about it at all?
- if variant_code:
- raise xpathlite.Error('we do not support variants ("%s")' % variant_code)
-
- language_id = enumdata.languageCodeToId(language_code)
- if language_id <= 0:
- raiseUnknownCode(language_code, 'language')
-
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raiseUnknownCode(script_code, 'script')
-
- # we should handle fully qualified names with the territory
- if not country_code:
- return {}
- country_id = enumdata.countryCodeToId(country_code)
- if country_id <= 0:
- raiseUnknownCode(country_code, 'country')
-
- # So we say we accept only those values that have "contributed" or
- # "approved" resolution. see http://www.unicode.org/cldr/process.html
- # But we only respect the resolution for new datas for backward
- # compatibility.
- draft = DraftResolution.contributed
-
- result = dict(
- language=enumdata.language_list[language_id][0],
- language_code=language_code, language_id=language_id,
- script=enumdata.script_list[script_id][0],
- script_code=script_code, script_id=script_id,
- country=enumdata.country_list[country_id][0],
- country_code=country_code, country_id=country_id,
- variant_code=variant_code)
-
- (dir_name, file_name) = os.path.split(path)
- def from_supplement(tag,
- path=os.path.join(dir_name, '..', 'supplemental',
- 'supplementalData.xml')):
- return findTagsInFile(path, tag)
- currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
- result['currencyIsoCode'] = ''
- result['currencyDigits'] = 2
- result['currencyRounding'] = 1
- if currencies:
- for e in currencies:
- if e[0] == 'currency':
- t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
- if t and t[0]:
- pass
- elif not any(x[0] == 'to' for x in e[1]):
- result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next()
- break
- if result['currencyIsoCode']:
- t = from_supplement("currencyData/fractions/info[iso4217=%s]"
- % result['currencyIsoCode'])
- if t and t[0][0] == 'info':
- result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next()
- result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next()
- numbering_system = None
- try:
- numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
- except xpathlite.Error:
- pass
- def findEntryDef(path, xpath, value=''):
- try:
- return findEntry(path, xpath)
- except xpathlite.Error:
- return value
- def get_number_in_system(path, xpath, numbering_system):
- if numbering_system:
- try:
- return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
- except xpathlite.Error:
- # in CLDR 1.9 number system was refactored for numbers (but not for currency)
- # so if previous findEntry doesn't work we should try this:
- try:
- return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
- except xpathlite.Error:
- # fallback to default
- pass
- return findEntry(path, xpath)
-
- result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
- result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
- assert result['decimal'] != result['group']
- result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
- result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
- try:
- digits = getNumberSystems()[numbering_system][u"digits"];
- assert len(digits) == 10 and all(ord(d) - i == ord(digits[0]) for i, d in enumerate(digits))
- result['zero'] = digits[0]
- except Exception as e:
- sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
- result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
- result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
- result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
- result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system)
- result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
- result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
- result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
- result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
- result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
- result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
- result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
- result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
- result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
- result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
- result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
- result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
- result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
- result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))
-
- endonym = None
- if country_code and script_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
- if not endonym and script_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
- if not endonym and country_code:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
- if not endonym:
- endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
- result['languageEndonym'] = endonym
- result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))
-
- currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
- currency_format = parse_number_format(currency_format, result)
- result['currencyFormat'] = currency_format[0]
- result['currencyNegativeFormat'] = ''
- if len(currency_format) > 1:
- result['currencyNegativeFormat'] = currency_format[1]
-
- result['currencySymbol'] = ''
- result['currencyDisplayName'] = ''
- if result['currencyIsoCode']:
- stem = "numbers/currencies/currency[%s]/" % result['currencyIsoCode']
- result['currencySymbol'] = findEntryDef(path, stem + 'symbol')
- displays = tuple(findEntryDef(path, stem + 'displayName' + tail)
- for tail in ('',) + tuple(
- '[count=%s]' % x for x in ('zero', 'one', 'two',
- 'few', 'many', 'other')))
- while displays and not displays[-1]:
- displays = displays[:-1]
- result['currencyDisplayName'] = ';'.join(displays)
-
- def findUnitDef(path, stem, fallback=''):
- # The displayName for a quantified unit in en.xml is kByte
- # instead of kB (etc.), so prefer any unitPattern provided:
- for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
- try:
- ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
- except xpathlite.Error:
- continue
-
- # TODO: epxloit count-handling, instead of discarding placeholders
- if ans.startswith('{0}'):
- ans = ans[3:].lstrip()
- if ans:
- return ans
-
- return findEntryDef(path, stem + 'displayName', fallback)
-
- # First without quantifier, then quantified each way:
- result['byte_unit'] = findEntryDef(
- path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
- 'bytes')
- stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
- known = [] # cases where we *do* have a given version:
- result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
- # IEC 60027-2
- # http://physics.nist.gov/cuu/Units/binary.html
- result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
-
- # Used for month and day data:
- namings = (
- ('standaloneLong', 'stand-alone', 'wide'),
- ('standaloneShort', 'stand-alone', 'abbreviated'),
- ('standaloneNarrow', 'stand-alone', 'narrow'),
- ('long', 'format', 'wide'),
- ('short', 'format', 'abbreviated'),
- ('narrow', 'format', 'narrow'),
- )
-
- # Month names for 12-month calendars:
- for cal in calendars:
- stem = 'dates/calendars/calendar[' + cal + ']/months/'
- for (key, mode, size) in namings:
- prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
- result[key + 'Months_' + cal] = ';'.join(
- findEntry(path, stem + prop + "month[%d]" % i)
- for i in range(1, 13))
-
- # Day data (for Gregorian, at least):
- stem = 'dates/calendars/calendar[gregorian]/days/'
- days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
- for (key, mode, size) in namings:
- prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
- result[key + 'Days'] = ';'.join(
- findEntry(path, stem + prop + '[' + day + ']')
- for day in days)
-
- return Locale(result)
-
-def addEscapes(s):
- result = ''
- for c in s:
- n = ord(c)
- if n < 128:
- result += c
- else:
- result += "\\x"
- result += "%02x" % (n)
- return result
-
-def unicodeStr(s):
- utf8 = s.encode('utf-8')
- return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>"
-
-def usage():
- print "Usage: cldr2qlocalexml.py <path-to-cldr-main>"
- sys.exit()
-
-def integrateWeekData(filePath):
- if not filePath.endswith(".xml"):
- return {}
-
- def lookup(key):
- return findEntryInFile(filePath, key, attribute='territories')[0].split()
- days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
-
- firstDayByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/firstDay[day=%s]' % day):
- firstDayByCountryCode[countryCode] = day
-
- weekendStartByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/weekendStart[day=%s]' % day):
- weekendStartByCountryCode[countryCode] = day
-
- weekendEndByCountryCode = {}
- for day in days:
- for countryCode in lookup('weekData/weekendEnd[day=%s]' % day):
- weekendEndByCountryCode[countryCode] = day
-
- for (key, locale) in locale_database.iteritems():
- countryCode = locale.country_code
- if countryCode in firstDayByCountryCode:
- locale.firstDayOfWeek = firstDayByCountryCode[countryCode]
- else:
- locale.firstDayOfWeek = firstDayByCountryCode["001"]
-
- if countryCode in weekendStartByCountryCode:
- locale.weekendStart = weekendStartByCountryCode[countryCode]
- else:
- locale.weekendStart = weekendStartByCountryCode["001"]
-
- if countryCode in weekendEndByCountryCode:
- locale.weekendEnd = weekendEndByCountryCode[countryCode]
- else:
- locale.weekendEnd = weekendEndByCountryCode["001"]
-
-def splitLocale(name):
- """Split name into (language, script, territory) triple as generator.
-
- Ignores any trailing fields (with a warning), leaves script (a capitalised
- four-letter token) or territory (either a number or an all-uppercase token)
- empty if unspecified, returns a single-entry generator if name is a single
- tag (i.e. contains no underscores). Always yields 1 or 3 values, never 2."""
- tags = iter(name.split('_'))
- yield tags.next() # Language
- tag = tags.next()
-
- # Script is always four letters, always capitalised:
- if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
- yield tag
- try:
- tag = tags.next()
- except StopIteration:
- tag = ''
- else:
- yield ''
-
- # Territory is upper-case or numeric:
- if tag and tag.isupper() or tag.isdigit():
- yield tag
- tag = ''
+from localetools import Error
+from cldr import CldrReader
+from qlocalexml import QLocaleXmlWriter
+from enumdata import language_list, script_list, country_list
+
+def usage(name, err, message = ''):
+ err.write("""Usage: {} path/to/cldr/common/main [out-file.xml]
+""".format(name)) # TODO: expand command-line, improve help message
+ if message:
+ err.write('\n' + message + '\n')
+
+def main(args, out, err):
+ # TODO: make calendars a command-line option
+ calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew'
+
+ # TODO: make argument parsing more sophisticated
+ name = args.pop(0)
+ if not args:
+ usage(name, err, 'Where is your CLDR data tree ?')
+ return 1
+
+ root = args.pop(0)
+ if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')):
+ usage(name, err,
+ 'First argument is the root of the CLDR tree: found no common/main/root.xml under '
+ + root)
+ return 1
+
+ xml = args.pop(0) if args else None
+ if not xml or xml == '-':
+ emit = out
+ elif not xml.endswith('.xml'):
+ usage(name, err, 'Please use a .xml extension on your output file name, not ' + xml)
+ return 1
else:
- yield ''
-
- # If nothing is left, StopIteration will avoid the warning:
- tag = (tag if tag else tags.next(),)
- sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
-
-if len(sys.argv) != 2:
- usage()
-
-cldr_dir = sys.argv[1]
-
-if not os.path.isdir(cldr_dir):
- usage()
-
-cldr_files = os.listdir(cldr_dir)
-
-locale_database = {}
-
-# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = []
-for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
- 'supplementalMetadata.xml'),
- 'metadata/defaultContent'):
- for data in ns[1:][0]:
- if data[0] == u"locales":
- defaultContent_locales += data[1].split()
-
-skips = []
-for file in defaultContent_locales:
- try:
- language_code, script_code, country_code = splitLocale(file)
- except ValueError:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
- continue
-
- if not (script_code or country_code):
- sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
- continue
-
- try:
- l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
- if not l:
- skips.append(file)
- continue
- except xpathlite.Error as e:
- sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
- wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
- skips = []
-
-for file in cldr_files:
- try:
- l = generateLocaleInfo(cldr_dir + "/" + file)
- if not l:
- skips.append(file)
- continue
- except xpathlite.Error as e:
- sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
- continue
-
- locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
-
-if skips:
- wrappedwarn('skipping files [no locale info generated]: ', skips)
-
-integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
-locale_keys = locale_database.keys()
-locale_keys.sort()
-
-cldr_version = 'unknown'
-ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r")
-for line in ldml:
- if 'version cldrVersion CDATA #FIXED' in line:
- cldr_version = line.split('"')[1]
-
-if sys.stdout.encoding != 'UTF-8' or (sys.stdout.encoding is None and sys.getdefaultencoding() != 'UTF-8'):
- reload(sys) # Weirdly, this gets a richer sys module than the plain import got us !
- sys.setdefaultencoding('UTF-8')
-
-print "<localeDatabase>"
-print " <version>" + cldr_version + "</version>"
-print " <languageList>"
-for id in enumdata.language_list:
- l = enumdata.language_list[id]
- print " <language>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </language>"
-print " </languageList>"
-
-print " <scriptList>"
-for id in enumdata.script_list:
- l = enumdata.script_list[id]
- print " <script>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </script>"
-print " </scriptList>"
-
-print " <countryList>"
-for id in enumdata.country_list:
- l = enumdata.country_list[id]
- print " <country>"
- print " <name>" + l[0] + "</name>"
- print " <id>" + str(id) + "</id>"
- print " <code>" + l[1] + "</code>"
- print " </country>"
-print " </countryList>"
-
-def _parseLocale(l):
- language = "AnyLanguage"
- script = "AnyScript"
- country = "AnyCountry"
-
- if l == "und":
- raise xpathlite.Error("we are treating unknown locale like C")
-
- parsed = splitLocale(l)
- language_code = parsed.next()
- script_code = country_code = ''
- try:
- script_code, country_code = parsed
- except ValueError:
- pass
-
- if language_code != "und":
- language_id = enumdata.languageCodeToId(language_code)
- if language_id == -1:
- raise xpathlite.Error('unknown language code "%s"' % language_code)
- language = enumdata.language_list[language_id][0]
-
- if script_code:
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raise xpathlite.Error('unknown script code "%s"' % script_code)
- script = enumdata.script_list[script_id][0]
-
- if country_code:
- country_id = enumdata.countryCodeToId(country_code)
- if country_id == -1:
- raise xpathlite.Error('unknown country code "%s"' % country_code)
- country = enumdata.country_list[country_id][0]
+ try:
+ emit = open(xml, 'w')
+ except IOError as e:
+ usage(name, err, 'Failed to open "{}" to write output to it\n'.format(xml))
+ return 1
- return (language, script, country)
+ if args:
+ usage(name, err, 'Too many arguments - excess: ' + ' '.join(args))
+ return 1
-skips = []
-print " <likelySubtags>"
-for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
- tmp = {}
- for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
- tmp[data[0]] = data[1]
+ if emit.encoding != 'UTF-8' or (emit.encoding is None and sys.getdefaultencoding() != 'UTF-8'):
+ reload(sys) # Weirdly, this gets a richer sys module than the plain import got us !
+ sys.setdefaultencoding('UTF-8')
- try:
- from_language, from_script, from_country = _parseLocale(tmp[u"from"])
- to_language, to_script, to_country = _parseLocale(tmp[u"to"])
- except xpathlite.Error as e:
- if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
- skips.append(tmp[u'to'])
- else:
- sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
- continue
- # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
- if to_country == "AnyCountry" and from_country != to_country:
- to_country = from_country
- if to_script == "AnyScript" and from_script != to_script:
- to_script = from_script
+ # TODO - command line options to tune choice of grumble and whitter:
+ reader = CldrReader(root, err.write, err.write)
+ writer = QLocaleXmlWriter(emit.write)
- print " <likelySubtag>"
- print " <from>"
- print " <language>" + from_language + "</language>"
- print " <script>" + from_script + "</script>"
- print " <country>" + from_country + "</country>"
- print " </from>"
- print " <to>"
- print " <language>" + to_language + "</language>"
- print " <script>" + to_script + "</script>"
- print " <country>" + to_country + "</country>"
- print " </to>"
- print " </likelySubtag>"
-print " </likelySubtags>"
-if skips:
- wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
-print " <localeList>"
+ writer.version(reader.root.cldrVersion)
+ writer.enumData(language_list, script_list, country_list)
+ writer.likelySubTags(reader.likelySubTags())
+ writer.locales(reader.readLocales(calendars), calendars)
-Locale.C(calendars).toXml(calendars)
-for key in locale_keys:
- locale_database[key].toXml(calendars)
+ writer.close()
+ return 0
-print " </localeList>"
-print "</localeDatabase>"
+if __name__ == '__main__':
+ sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/cldr2qtimezone.py b/util/locale_database/cldr2qtimezone.py
index 4c3609056d..70b5d1e69e 100755
--- a/util/locale_database/cldr2qtimezone.py
+++ b/util/locale_database/cldr2qtimezone.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python2
#############################################################################
##
-## Copyright (C) 2019 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
@@ -34,59 +34,20 @@ the CLDR data. Pass its common/ directory as first parameter to this
script and the qtbase root directory as second parameter. It shall
update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for
use.
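+
+For example (paths purely illustrative)::
+
+    ./cldr2qtimezone.py ~/cldr/common ~/qtbase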
-
-The XML structure is as follows:
-
- <supplementalData>
- <version number="$Revision:...$"/>
- <generation date="$Date:...$"/>
- <windowsZones>
- <mapTimezones otherVersion="..." typeVersion="...">
- <!-- (UTC-08:00) Pacific Time (US & Canada) -->
- <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/>
- <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/>
- <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/>
- <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/>
- </mapTimezones>
- </windowsZones>
- </supplementalData>
"""
import os
-import sys
-import datetime
-import tempfile
-import enumdata
-import xpathlite
-from xpathlite import DraftResolution
import re
-import qlocalexml2cpp
+import datetime
+import textwrap
-findAlias = xpathlite.findAlias
-findEntry = xpathlite.findEntry
-findEntryInFile = xpathlite._findEntryInFile
-findTagsInFile = xpathlite.findTagsInFile
-unicode2hex = qlocalexml2cpp.unicode2hex
-wrap_list = qlocalexml2cpp.wrap_list
+from localetools import unicode2hex, wrap_list, Error, SourceFileEditor
+from cldr import CldrAccess
-class ByteArrayData:
- def __init__(self):
- self.data = []
- self.hash = {}
- def append(self, s):
- s = s + '\0'
- if s in self.hash:
- return self.hash[s]
+### Data that may need updates in response to new entries in the CLDR file ###
- lst = unicode2hex(s)
- index = len(self.data)
- if index > 65535:
- print "\n\n\n#error Data index is too big!"
- sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
- sys.exit(1)
- self.hash[s] = index
- self.data += lst
- return index
+# This script will report the updates needed, should any arise.
+# However, you may need to research the relevant zone's standard offset.
# List of currently known Windows IDs.
# If this script reports missing IDs, please add them here.
@@ -233,12 +194,6 @@ windowsIdList = (
(u'Yakutsk Standard Time', 32400),
)
-def windowsIdToKey(windowsId):
- for index, pair in enumerate(windowsIdList):
- if pair[0] == windowsId:
- return index + 1
- return 0
-
# List of standard UTC IDs to use. Not public so may be safely changed.
# Do not remove IDs, as each entry is part of the API/behavior guarantee.
# ( UTC Id, Offset Seconds )
@@ -285,94 +240,43 @@ utcIdList = (
(u'UTC+14:00', 50400),
)
-def usage():
- print "Usage: cldr2qtimezone.py <path to cldr core/common> <path to qtbase>"
- sys.exit()
-
-if len(sys.argv) != 3:
- usage()
-
-cldrPath = sys.argv[1]
-qtPath = sys.argv[2]
-
-if not os.path.isdir(cldrPath) or not os.path.isdir(qtPath):
- usage()
-
-windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml"
-tempFileDir = qtPath
-dataFilePath = qtPath + "/src/corelib/time/qtimezoneprivate_data_p.h"
-
-if not (os.path.isfile(windowsZonesPath) and os.path.isfile(dataFilePath)):
- usage()
-
-cldr_version = 'unknown'
-ldml = open(cldrPath + "/dtd/ldml.dtd", "r")
-for line in ldml:
- if 'version cldrVersion CDATA #FIXED' in line:
- cldr_version = line.split('"')[1]
-
-# [[u'version', [(u'number', u'$Revision: 7825 $')]]]
-versionNumber = findTagsInFile(windowsZonesPath, "version")[0][1][0][1]
-
-mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones")
-
-defaultDict = {}
-windowsIdDict = {}
-
-if mapTimezones:
- badZones = set()
- for mapZone in mapTimezones:
- # [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]]
- if mapZone[0] == u'mapZone':
- data = {}
- for attribute in mapZone[1]:
- if attribute[0] == u'other':
- data['windowsId'] = attribute[1]
- if attribute[0] == u'territory':
- data['countryCode'] = attribute[1]
- if attribute[0] == u'type':
- data['ianaList'] = attribute[1]
-
- data['windowsKey'] = windowsIdToKey(data['windowsId'])
- if data['windowsKey'] <= 0:
- badZones.add(data['windowsId'])
-
- countryId = 0
- if data['countryCode'] == u'001':
- defaultDict[data['windowsKey']] = data['ianaList']
- else:
- data['countryId'] = enumdata.countryCodeToId(data['countryCode'])
- if data['countryId'] < 0:
- raise xpathlite.Error("Unknown Country Code \"%s\"" % data['countryCode'])
- data['country'] = enumdata.country_list[data['countryId']][0]
- windowsIdDict[data['windowsKey'], data['countryId']] = data
- if badZones:
- sys.stderr.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones))
- + "\nto the windowIdList in cldr2qtimezone.py\n\n")
- raise xpathlite.Error("Unknown Windows IDs")
-
-print "Input file parsed, now writing data"
-
-GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n"
-GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n"
-
-# Create a temp file to write the new data into
-(newTempFile, newTempFilePath) = tempfile.mkstemp("qtimezone_data_p", dir=tempFileDir)
-newTempFile = os.fdopen(newTempFile, "w")
-
-# Open the old file and copy over the first non-generated section to the new file
-oldDataFile = open(dataFilePath, "r")
-s = oldDataFile.readline()
-while s and s != GENERATED_BLOCK_START:
- newTempFile.write(s)
- s = oldDataFile.readline()
-
-# Write out generated block start tag and warning
-newTempFile.write(GENERATED_BLOCK_START)
-newTempFile.write("""
+### End of data that may need updates in response to CLDR ###
+
+class ByteArrayData:
+ def __init__(self):
+ self.data = []
+ self.hash = {}
+
+ def append(self, s):
+ s = s + '\0'
+ if s in self.hash:
+ return self.hash[s]
+
+ lst = unicode2hex(s)
+ index = len(self.data)
+ if index > 0xffff:
+ raise Error('Index ({}) outside the uint16 range !'.format(index))
+ self.hash[s] = index
+ self.data += lst
+ return index
+
+ def write(self, out, name):
+ out('\nstatic const char {}[] = {{\n'.format(name))
+ out(wrap_list(self.data))
+ out('\n};\n')
+
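+# Illustrative only (not part of the generated output): ByteArrayData.append()
+# de-duplicates strings, returning the index at which each was first stored:
+#     table = ByteArrayData()
+#     table.append(u'America/Vancouver')   # -> 0
+#     table.append(u'PST8PDT')             # -> 18 (previous entry plus its '\0')
+#     table.append(u'America/Vancouver')   # -> 0 again; no new data added
+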
+class ZoneIdWriter (SourceFileEditor):
+ def write(self, version, defaults, windowsIds):
+ self.__writeWarning(version)
+ windows, iana = self.__writeTables(self.writer.write, defaults, windowsIds)
+ windows.write(self.writer.write, 'windowsIdData')
+ iana.write(self.writer.write, 'ianaIdData')
+
+ def __writeWarning(self, version):
+ self.writer.write("""
/*
- This part of the file was generated on %s from the
- Common Locale Data Repository v%s supplemental/windowsZones.xml file %s
+ This part of the file was generated on {} from the
+ Common Locale Data Repository v{} file supplemental/windowsZones.xml
http://www.unicode.org/cldr/
@@ -380,80 +284,111 @@ newTempFile.write("""
edited) CLDR data; see qtbase/util/locale_database/.
*/
-""" % (str(datetime.date.today()), cldr_version, versionNumber) )
-
-windowsIdData = ByteArrayData()
-ianaIdData = ByteArrayData()
-
-# Write Windows/IANA table
-newTempFile.write("// Windows ID Key, Country Enum, IANA ID Index\n")
-newTempFile.write("static const QZoneData zoneDataTable[] = {\n")
-for index in sorted(windowsIdDict):
- data = windowsIdDict[index]
- newTempFile.write(" { %6d,%6d,%6d }, // %s / %s\n"
- % (data['windowsKey'],
- data['countryId'],
- ianaIdData.append(data['ianaList']),
- data['windowsId'],
- data['country']))
-newTempFile.write(" { 0, 0, 0 } // Trailing zeroes\n")
-newTempFile.write("};\n\n")
-
-print "Done Zone Data"
-
-# Write Windows ID key table
-newTempFile.write("// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n")
-newTempFile.write("static const QWindowsData windowsDataTable[] = {\n")
-for index, pair in enumerate(windowsIdList):
- newTempFile.write(" { %6d,%6d,%6d,%6d }, // %s\n"
- % (index + 1, windowsIdData.append(pair[0]),
- ianaIdData.append(defaultDict[index + 1]), pair[1], pair[0]))
-newTempFile.write(" { 0, 0, 0, 0 } // Trailing zeroes\n")
-newTempFile.write("};\n\n")
-
-print "Done Windows Data Table"
-
-# Write UTC ID key table
-newTempFile.write("// IANA ID Index, UTC Offset\n")
-newTempFile.write("static const QUtcData utcDataTable[] = {\n")
-for pair in utcIdList:
- newTempFile.write(" { %6d,%6d }, // %s\n"
- % (ianaIdData.append(pair[0]), pair[1], pair[0]))
-newTempFile.write(" { 0, 0 } // Trailing zeroes\n")
-newTempFile.write("};\n\n")
-
-print "Done UTC Data Table"
-
-# Write out Windows ID's data
-newTempFile.write("static const char windowsIdData[] = {\n")
-newTempFile.write(wrap_list(windowsIdData.data))
-newTempFile.write("\n};\n\n")
-
-# Write out IANA ID's data
-newTempFile.write("static const char ianaIdData[] = {\n")
-newTempFile.write(wrap_list(ianaIdData.data))
-newTempFile.write("\n};\n")
-
-print "Done ID Data Table"
-
-# Write out the end of generated block tag
-newTempFile.write(GENERATED_BLOCK_END)
-s = oldDataFile.readline()
-
-# Skip through the old generated data in the old file
-while s and s != GENERATED_BLOCK_END:
- s = oldDataFile.readline()
-
-# Now copy the rest of the original file into the new file
-s = oldDataFile.readline()
-while s:
- newTempFile.write(s)
- s = oldDataFile.readline()
-
-# Now close the old and new file, delete the old file and copy the new file in its place
-newTempFile.close()
-oldDataFile.close()
-os.remove(dataFilePath)
-os.rename(newTempFilePath, dataFilePath)
-
-print "Data generation completed, please check the new file at " + dataFilePath
+""".format(str(datetime.date.today()), version))
+
+ @staticmethod
+ def __writeTables(out, defaults, windowsIds):
+ windowsIdData, ianaIdData = ByteArrayData(), ByteArrayData()
+
+ # Write Windows/IANA table
+ out('// Windows ID Key, Country Enum, IANA ID Index\n')
+ out('static const QZoneData zoneDataTable[] = {\n')
+ for index, data in sorted(windowsIds.items()):
+ out(' {{ {:6d},{:6d},{:6d} }}, // {} / {}\n'.format(
+ data['windowsKey'], data['countryId'],
+ ianaIdData.append(data['ianaList']),
+ data['windowsId'], data['country']))
+ out(' { 0, 0, 0 } // Trailing zeroes\n')
+ out('};\n\n')
+
+ # Write Windows ID key table
+ out('// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n')
+ out('static const QWindowsData windowsDataTable[] = {\n')
+ for index, pair in enumerate(windowsIdList, 1):
+ out(' {{ {:6d},{:6d},{:6d},{:6d} }}, // {}\n'.format(
+ index,
+ windowsIdData.append(pair[0]),
+ ianaIdData.append(defaults[index]),
+ pair[1], pair[0]))
+ out(' { 0, 0, 0, 0 } // Trailing zeroes\n')
+ out('};\n\n')
+
+ # Write UTC ID key table
+ out('// IANA ID Index, UTC Offset\n')
+ out('static const QUtcData utcDataTable[] = {\n')
+ for pair in utcIdList:
+ out(' {{ {:6d},{:6d} }}, // {}\n'.format(
+ ianaIdData.append(pair[0]), pair[1], pair[0]))
+ out(' { 0, 0 } // Trailing zeroes\n')
+ out('};\n')
+
+ return windowsIdData, ianaIdData
+
+def usage(err, name, message=''):
+ err.write("""Usage: {} path/to/cldr/core/common path/to/qtbase
+""".format(name)) # TODO: more interesting message
+ if message:
+ err.write('\n' + message + '\n')
+
+def main(args, out, err):
+ """Parses CLDR's data and updates Qt's representation of it.
+
+ Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
+ arguments. Expects two command-line options: the root of the
+ unpacked CLDR data-file tree and the root of the qtbase module's
+ checkout. Updates QTimeZone's private data about Windows time-zone
+ IDs."""
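+    # Typical invocation (paths are illustrative only):
+    #     cldr2qtimezone.py ~/cldr/common ~/src/qt/qtbase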
+ name = args.pop(0)
+ if len(args) != 2:
+ usage(err, name, "Expected two arguments")
+ return 1
+
+ cldrPath = args.pop(0)
+ qtPath = args.pop(0)
+
+ if not os.path.isdir(qtPath):
+ usage(err, name, "No such Qt directory: " + qtPath)
+ return 1
+ if not os.path.isdir(cldrPath):
+ usage(err, name, "No such CLDR directory: " + cldrPath)
+ return 1
+
+ dataFilePath = os.path.join(qtPath, 'src', 'corelib', 'time', 'qtimezoneprivate_data_p.h')
+ if not os.path.isfile(dataFilePath):
+ usage(err, name, 'No such file: ' + dataFilePath)
+ return 1
+
+ try:
+ version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones(
+ dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1)))
+ except IOError as e:
+ usage(err, name,
+ 'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1]))
+ return 1
+ except Error as e:
+ err.write('\n'.join(textwrap.wrap(
+ 'Failed to read windowsZones.xml: ' + (e.message or e.args[1]),
+ subsequent_indent=' ', width=80)) + '\n')
+ return 1
+
+ out.write('Input file parsed, now writing data\n')
+ try:
+ writer = ZoneIdWriter(dataFilePath, qtPath)
+ except IOError as e:
+ err.write('Failed to open files to transcribe: {}'.format(e.message or e.args[1]))
+ return 1
+
+ try:
+ writer.write(version, defaults, winIds)
+ except Error as e:
+ writer.cleanup()
+ err.write('\nError in Windows ID data: ' + e.message + '\n')
+ return 1
+
+ writer.close()
+ out.write('Data generation completed, please check the new file at ' + dataFilePath + '\n')
+ return 0
+
+if __name__ == '__main__':
+ import sys
+ sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py
new file mode 100644
index 0000000000..e3e3a2e4ba
--- /dev/null
+++ b/util/locale_database/ldml.py
@@ -0,0 +1,589 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Parsing the Locale Data Markup Language
+
+It's an XML format, so the raw parsing of XML is, of course, delegated
+to xml.dom.minidom; but it has its own specific schemata and some
+funky rules for combining data from various files (inheritance between
+locales). The use of it we're interested in is extraction of CLDR's
+data, so some of the material here is specific to CLDR; see cldr.py
+for how it is mainly used.
+
+Provides various classes to wrap xml.dom's objects, specifically those
+returned by minidom.parse() and their child-nodes:
+ Node -- wraps any node in the DOM tree
+ XmlScanner -- wraps the root element of a stand-alone XML file
+ Supplement -- specializes XmlScanner for supplemental data files
+ LocaleScanner -- wraps a locale's inheritance-chain of file roots
+
+See individual classes for further detail.
+"""
+from localetools import Error
+from dateconverter import convert_date
+
+class Node (object):
+ """Wrapper for an arbitrary DOM node.
+
+    Provides various ways to select children of a node. Selected child
+ nodes are returned wrapped as Node objects. A Node exposes the
+ raw DOM node it wraps via its .dom attribute."""
+
+ def __init__(self, elt, dullAttrs = None, draft = 0):
+ """Wraps a DOM node for ease of access.
+
+ First argument, elt, is the DOM node to wrap.
+
+ Optional second argument, dullAttrs, should either be None or
+ map each LDML tag name to a list of the names of
+ non-distinguishing attributes for nodes with the given tag
+ name. If None is given, no distinguishing attribute checks are
+ performed.
+
+ (Optional third argument, draft, should only be supplied by
+ this class's creation of child nodes; it is the maximum draft
+ score of any ancestor of the new node.)"""
+ self.dom, self.__dull = elt, dullAttrs
+ try:
+ attr = elt.attributes['draft'].nodeValue
+ except KeyError:
+ self.draft = draft
+ else:
+ self.draft = max(draft, self.draftScore(attr))
+
+ def findAllChildren(self, tag, wanted = None, allDull = False):
+ """All children that do have the given tag and attributes.
+
+ First argument is the tag: children with any other tag are
+ ignored.
+
+ Optional second argument, wanted, should either be None or map
+ attribute names to the values they must have. Only child nodes
+        with these attributes set to the given values are yielded.
+
+ By default, nodes that have distinguishing attributes, other
+ than those specified in wanted, are ignored. Pass the allDull
+ parameter a true value to suppress this check."""
+
+ if self.__dull is None:
+ allDull = True
+ dull = () if allDull else self.__dull[tag]
+
+ for child in self.dom.childNodes:
+ if child.nodeType != child.ELEMENT_NODE:
+ continue
+ if child.nodeName != tag:
+ continue
+
+ if wanted:
+ try:
+ if any(child.attributes[k].nodeValue != v
+ for k, v in wanted.items()):
+ continue
+ except KeyError: # Some wanted attribute is missing
+ continue
+
+ if not (allDull or all(k in dull or k in wanted
+ for k in child.attributes.keys())):
+ continue
+
+ elif not (allDull or all(k in dull
+ for k in child.attributes.keys())):
+ continue
+
+ yield Node(child, self.__dull, self.draft)
+
+ def findUniqueChild(self, tag):
+ """Returns the single child with the given nodeName.
+
+ Raises Error if there is no such child or there is more than
+ one."""
+ seq = self.findAllChildren(tag)
+ try:
+ node = seq.next()
+ except StopIteration:
+ raise Error('No child found where one was expected', tag)
+ for it in seq:
+ raise Error('Many children found where only one was expected', tag)
+ return node
+
+ @classmethod
+ def draftScore(cls, level):
+ """Maps draft level names to numeric scores.
+
+ Single parameter, level, is the least sure value of the draft
+ attribute on a node that you're willing to accept; returns a
+ numeric value (lower is less drafty).
+
+ Tempting as it is to insist on low draft scores, there are
+ many locales in which pretty much every leaf is
+ unconfirmed. It may make sense to actually check each
+ XmlScanner object, or each node in each LocaleScanner's nodes
+ list, to see what its distribution of draft level looks like,
+ so as to set the acceptable draft score for its elements
+ accordingly. However, for the moment, we mostly just accept
+ all elements, regardless of draft values (the one exception is
+ am/pm indicators)."""
+ return cls.__draftScores.get(level, 5) if level else 0
+
+ # Implementation details:
+ __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2,
+ contributed = 1, approved = 0, false = 0)
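+    # For example, given the scores above:
+    #     Node.draftScore('unconfirmed')  # -> 3
+    #     Node.draftScore('approved')     # -> 0
+    #     Node.draftScore('')             # -> 0 (no draft attribute at all)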
+
+def _parseXPath(selector):
+ # Split "tag[attr=val][...]" into tag-name and attribute mapping
+ attrs = selector.split('[')
+ name = attrs.pop(0)
+ if attrs:
+ attrs = [x.strip() for x in attrs]
+ assert all(x.endswith(']') for x in attrs)
+ attrs = [x[:-1].split('=') for x in attrs]
+ assert all(len(x) in (1, 2) for x in attrs)
+ attrs = (('type', x[0]) if len(x) == 1 else x for x in attrs)
+ return name, dict(attrs)
+
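+# For example:
+#     _parseXPath("unit[digital-byte]")  # -> ('unit', {'type': 'digital-byte'})
+#     _parseXPath("symbols[numberSystem=latn]")  # -> ('symbols', {'numberSystem': 'latn'})
+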
+def _iterateEach(iters):
+ # Flatten a two-layer iterator.
+ for it in iters:
+ for item in it:
+ yield item
+
+class XmlScanner (object):
+ """Wrap an XML file to enable XPath access to its nodes.
+ """
+ def __init__(self, node):
+ self.root = node
+
+ def findNodes(self, xpath):
+ """Return all nodes under self.root matching this xpath.
+
+ Ignores any excess attributes."""
+ elts = (self.root,)
+ for selector in xpath.split('/'):
+ tag, attrs = _parseXPath(selector)
+ elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts))
+ if not elts:
+ break
+ return elts
+
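+# Illustrative use of findNodes() (the xpath shown is only an example, in the
+# [attr=value] notation _parseXPath() accepts, not LDML's [@attr='value'] form;
+# rootNode here stands for a Node-wrapped document element):
+#     XmlScanner(rootNode).findNodes('numbers/symbols[numberSystem=latn]/decimal')
+# returns a tuple of the matching <decimal> nodes, wrapped as Node objects.
+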
+class Supplement (XmlScanner):
+ def find(self, xpath):
+ elts = self.findNodes(xpath)
+ for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,)
+ for e in elts):
+ if elt.attributes:
+ yield (elt.nodeName,
+ dict((k, v if isinstance(v, basestring) else v.nodeValue)
+ for k, v in elt.attributes.items()))
+
+class LocaleScanner (object):
+ def __init__(self, name, nodes, root):
+ self.name, self.nodes, self.base = name, nodes, root
+
+ def find(self, xpath, default = None, draft = None):
+ """XPath search for the content of an element.
+
+ Required argument, xpath, is the XPath to search for. Optional
+ second argument is a default value to use, if no such node is
+ found. Optional third argument is a draft score (see
+ Node.draftScore() for details); if given, leaf elements with
+ higher draft scores are ignored."""
+ try:
+ for elt in self.__find(xpath):
+ try:
+ if draft is None or elt.draft <= draft:
+ return elt.dom.firstChild.nodeValue
+ except (AttributeError, KeyError):
+ pass
+ except Error as e:
+ if default is None:
+ raise
+ return default
+
+ def tagCodes(self):
+ """Yields four tag codes
+
+ The tag codes are language, script, country and variant; an
+ empty value for any of them indicates that no value was
+ provided. The values are obtained from the primary file's
+ top-level <identity> element. An Error is raised if any
+ top-level <alias> element of this file has a non-empty source
+ attribute; that attribute value is mentioned in the error's
+ message."""
+ root = self.nodes[0]
+ for alias in root.findAllChildren('alias', allDull=True):
+ try:
+ source = alias.dom.attributes['source'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ raise Error('Alias to {}'.format(source))
+
+ ids = root.findUniqueChild('identity')
+ for code in ('language', 'script', 'territory', 'variant'):
+ for node in ids.findAllChildren(code, allDull=True):
+ try:
+ yield node.dom.attributes['type'].nodeValue
+ except (KeyError, AttributeError):
+ pass
+ else:
+ break # only want one value for each code
+ else: # No value for this code, use empty
+ yield ''
+
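+    # For example, tagCodes() for CLDR's common/main/en_GB.xml should yield
+    # 'en', '', 'GB', '' (language, script, territory, variant), assuming that
+    # file's <identity> only names a language and a territory.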
+ def currencyData(self, isoCode):
+ """Fetches currency data for this locale.
+
+ Single argument, isoCode, is the ISO currency code for the
+ currency in use in the country. See also numericData, which
+ includes some currency formats.
+ """
+ if isoCode:
+ stem = 'numbers/currencies/currency[{}]/'.format(isoCode)
+ symbol = self.find(stem + 'symbol', '')
+ displays = tuple(self.find(stem + 'displayName' + tail, '')
+ for tail in ('',) + tuple(
+ '[count={}]'.format(x) for x in ('zero', 'one', 'two',
+ 'few', 'many', 'other')))
+ while displays and not displays[-1]:
+ displays = displays[:-1]
+ name = ';'.join(displays)
+ else:
+ symbol = name = ''
+ yield 'currencySymbol', symbol
+ yield 'currencyDisplayName', name
+
+ def numericData(self, lookup, complain = lambda text: None):
+ """Generate assorted numeric data for the locale.
+
+ First argument, lookup, is a callable that maps a numbering
+ system's name to certain data about the system, as a mapping;
+ we expect this to have u'digits' as a key.
+ """
+ system = self.find('numbers/defaultNumberingSystem')
+ stem = 'numbers/symbols[numberSystem={}]/'.format(system)
+ decimal = self.find(stem + 'decimal')
+ group = self.find(stem + 'group')
+ assert decimal != group, (self.name, system, decimal)
+ yield 'decimal', decimal
+ yield 'group', group
+ yield 'percent', self.find(stem + 'percentSign')
+ yield 'list', self.find(stem + 'list')
+ yield 'exp', self.find(stem + 'exponential')
+
+ digits = lookup(system)['digits']
+ assert len(digits) == 10
+ zero = digits[0]
+ # Qt's number-formatting code assumes digits are consecutive:
+ assert all(ord(c) == i for i, c in enumerate(digits, ord(zero)))
+ yield 'zero', zero
+
+ plus = self.find(stem + 'plusSign')
+ minus = self.find(stem + 'minusSign')
+ yield 'plus', plus
+ yield 'minus', minus
+
+ # Currency formatting:
+ xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[accounting]/pattern'
+ try:
+ money = self.find(xpath.replace('Formats/',
+ 'Formats[numberSystem={}]/'.format(system)))
+ except Error:
+ money = self.find(xpath)
+ money = self.__currencyFormats(money, plus, minus)
+ yield 'currencyFormat', money.next()
+ neg = ''
+ for it in money:
+ assert not neg, 'There should be at most one more pattern'
+ neg = it
+ yield 'currencyNegativeFormat', neg
+
+ def textPatternData(self):
+ for key in ('quotationStart', 'alternateQuotationEnd',
+ 'quotationEnd', 'alternateQuotationStart'):
+ yield key, self.find('delimiters/' + key)
+
+ for key in ('start', 'middle', 'end'):
+ yield ('listPatternPart' + key.capitalize(),
+ self.__fromLdmlListPattern(self.find(
+ 'listPatterns/listPattern/listPatternPart[{}]'.format(key))))
+ yield ('listPatternPartTwo',
+ self.__fromLdmlListPattern(self.find(
+ 'listPatterns/listPattern/listPatternPart[2]')))
+
+ stem = 'dates/calendars/calendar[gregorian]/'
+ # TODO: is wide really the right width to use here ?
+ # abbreviated might be an option ... or try both ?
+ meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/'
+ for key in ('am', 'pm'):
+ yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key),
+ draft = Node.draftScore('contributed'))
+
+ for pair in (('long', 'full'), ('short', 'short')):
+ for key in ('time', 'date'):
+ yield (pair[0] + key.capitalize() + 'Format',
+ convert_date(self.find(
+ stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format(
+ key, key, pair[1], key))))
+
+ def endonyms(self, language, script, country, variant):
+ # TODO: take variant into account ?
+ for seq in ((language, script, country),
+ (language, script), (language, country), (language,)):
+ if not all(seq):
+ continue
+ try:
+ yield ('languageEndonym',
+ self.find('localeDisplayNames/languages/language[{}]'
+ .format('_'.join(seq))))
+ except Error:
+ pass
+ else:
+ break
+ else:
+ # grumble(failed to find endonym for language)
+ yield 'languageEndonym', ''
+
+ yield ('countryEndonym',
+ self.find('localeDisplayNames/territories/territory[{}]'
+ .format(country), ''))
+
+ def unitData(self):
+ yield ('byte_unit',
+ self.find('units/unitLength[long]/unit[digital-byte]/displayName',
+ 'bytes'))
+
+ unit = self.__findUnit('', 'B')
+ cache = [] # Populated by the SI call, to give hints to the IEC call
+ yield ('byte_si_quantified',
+ ';'.join(self.__unitCount('', unit, cache)))
+ # IEC 60027-2
+ # http://physics.nist.gov/cuu/Units/binary.html
+ yield ('byte_iec_quantified',
+ ';'.join(self.__unitCount('bi', 'iB', cache)))
+
+ def calendarNames(self, calendars):
+ namings = self.__nameForms
+ for cal in calendars:
+ stem = 'dates/calendars/calendar[' + cal + ']/months/'
+ for key, mode, size in namings:
+ prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
+ yield (key + 'Months_' + cal,
+ ';'.join(self.find(stem + prop + 'month[{}]'.format(i))
+ for i in range(1, 13)))
+
+ # Day data (for Gregorian, at least):
+ stem = 'dates/calendars/calendar[gregorian]/days/'
+ days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
+ for (key, mode, size) in namings:
+ prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
+ yield (key + 'Days',
+ ';'.join(self.find(stem + prop + '[' + day + ']')
+ for day in days))
+
+ # Implementation details
+ __nameForms = (
+ ('standaloneLong', 'stand-alone', 'wide'),
+ ('standaloneShort', 'stand-alone', 'abbreviated'),
+ ('standaloneNarrow', 'stand-alone', 'narrow'),
+ ('long', 'format', 'wide'),
+ ('short', 'format', 'abbreviated'),
+ ('narrow', 'format', 'narrow'),
+ ) # Used for month and day names
+
+ def __find(self, xpath):
+ retries = [ xpath.split('/') ]
+ while retries:
+ tags, elts, roots = retries.pop(), self.nodes, (self.base.root,)
+ for selector in tags:
+ tag, attrs = _parseXPath(selector)
+ elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts))
+ if not elts:
+ break
+
+ else: # Found matching elements
+ # Possibly filter elts to prefer the least drafty ?
+ for elt in elts:
+ yield elt
+
+ # Process roots separately: otherwise the alias-processing
+ # is excessive.
+ for i, selector in enumerate(tags):
+ tag, attrs = _parseXPath(selector)
+
+ for alias in tuple(_iterateEach(r.findAllChildren('alias', allDull=True)
+ for r in roots)):
+ if alias.dom.attributes['source'].nodeValue == 'locale':
+ replace = alias.dom.attributes['path'].nodeValue.split('/')
+ retries.append(self.__xpathJoin(tags[:i], replace, tags[i:]))
+
+ roots = tuple(_iterateEach(r.findAllChildren(tag, attrs) for r in roots))
+ if not roots:
+ if retries: # Let outer loop fall back on an alias path:
+ break
+ sought = '/'.join(tags)
+ if sought != xpath:
+ sought += ' (for {})'.format(xpath)
+ raise Error('All lack child {} for {} in {}'.format(
+ selector, sought, self.name))
+
+ else: # Found matching elements
+ for elt in roots:
+ yield elt
+
+ sought = '/'.join(tags)
+ if sought != xpath:
+ sought += ' (for {})'.format(xpath)
+ raise Error('No {} in {}'.format(sought, self.name))
+
+ def __findUnit(self, keySuffix, quantify, fallback=''):
+ # The displayName for a quantified unit in en.xml is kByte
+ # (even for unitLength[narrow]) instead of kB (etc.), so
+ # prefer any unitPattern provided, but prune its placeholder:
+ for size in ('short', 'narrow'): # TODO: reverse order ?
+ stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify)
+ for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+ try:
+ ans = self.find(stem + 'unitPattern[count={}]'.format(count))
+ except Error:
+ continue
+
+ # TODO: do count-handling, instead of discarding placeholders
+ if False: # TODO: do it this way, instead !
+ ans = ans.replace('{0}', '').strip()
+ elif ans.startswith('{0}'):
+ ans = ans[3:].lstrip()
+ if ans:
+ return ans
+
+ try:
+ return self.find(stem + 'displayName')
+ except Error:
+ pass
+
+ return fallback
+
+ def __unitCount(self, keySuffix, suffix, cache,
+ # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+ # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+ siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+ """Work out the unit quantifiers.
+
+ Unfortunately, the CLDR data only go up to terabytes and we
+ want all the way to exabytes; but we can recognize the SI
+ quantifiers as prefixes, strip and identify the tail as the
+ localized translation for 'B' (e.g. French has 'octet' for
+ 'byte' and uses ko, Mo, Go, To from which we can extrapolate
+ Po, Eo).
+
+ Should be called first for the SI quantifiers, with suffix =
+ 'B', then for the IEC ones, with suffix = 'iB'; the list cache
+ (initially empty before first call) is used to let the second
+ call know what the first learned about the localized unit.
+ """
+ if suffix == 'iB': # second call, re-using first's cache
+ if cache:
+ byte = cache.pop()
+ if all(byte == k for k in cache):
+ suffix = 'i' + byte
+ for q in siQuantifiers:
+ # Those don't (yet, v36) exist in CLDR, so we always get the fall-back:
+ yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix)
+ else: # first call
+ tail = suffix = suffix or 'B'
+ for q in siQuantifiers:
+ it = self.__findUnit(keySuffix, q)
+ # kB for kilobyte, in contrast with KiB for IEC:
+ q = q[0] if q == 'kilo' else q[0].upper()
+ if not it:
+ it = q + tail
+ elif it.startswith(q):
+ rest = it[1:]
+ tail = rest if all(rest == k for k in cache) else suffix
+ cache.append(rest)
+ yield it
+
+ @staticmethod
+ def __currencyFormats(patterns, plus, minus):
+ for p in patterns.split(';'):
+ p = p.replace('0', '#').replace(',', '').replace('.', '')
+ try:
+ cut = p.find('#') + 1
+ except ValueError:
+ pass
+ else:
+ p = p[:cut] + p[cut:].replace('#', '')
+ p = p.replace('#', "%1")
+ # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
+ # there can be doubled or trippled currency sign, however none of the
+            # there can be a doubled or tripled currency sign; however, none
+            # of the locales uses that.
+ # Single quote goes away, but double goes to single:
+ p = p.replace("''", '###').replace("'", '').replace('###', "'")
+ # Use number system's signs:
+ p = p.replace('+', plus).replace('-', minus)
+ yield p
+
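+    # Worked example for __currencyFormats() (the pattern text is illustrative):
+    # u'\xa4#,##0.00' loses its digits, grouping and decimal marks, keeps one
+    # '#' as the value placeholder and maps the currency sign, giving '%2%1';
+    # a second, ';'-separated pattern yields the negative variant likewise.
+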
+ @staticmethod
+ def __fromLdmlListPattern(pattern):
+ # This is a very limited parsing of the format for list pattern part only.
+ return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3')
+
+ @staticmethod
+ def __fromLdmlPath(seq): # tool function for __xpathJoin()
+ """Convert LDML's [@name='value'] to our [name=value] form."""
+ for it in seq:
+ # First dismember it:
+ attrs = it.split('[')
+ tag = attrs.pop(0)
+ if not attrs: # Short-cut the easy case:
+ yield it
+ continue
+
+ assert all(x.endswith(']') for x in attrs)
+ attrs = [x[:-1].split('=') for x in attrs]
+ # Then fix each attribute specification in it:
+ attrs = [(x[0][1:] if x[0].startswith('@') else x[0],
+ x[1][1:-1] if x[1].startswith("'") and x[1].endswith("'") else x[1])
+ for x in attrs]
+ # Finally, put it all back together:
+ attrs = ['='.join(x) + ']' for x in attrs]
+ attrs.insert(0, tag)
+ yield '['.join(attrs)
+
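+    # Illustrative conversion by __fromLdmlPath():
+    #     "territory[@type='DE']"  ->  "territory[type=DE]"
+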
+ @classmethod
+ def __xpathJoin(cls, head, insert, tail):
+ """Join three lists of XPath selectors.
+
+ Each of head, insert and tail is a sequence of selectors but
+ insert may start with some uses of '..', that we want to
+ resolve away, and may use LDML's attribute format, that we
+ want to convert to our format."""
+ while insert and insert[0] == '..':
+ insert.pop(0)
+ head.pop()
+ return head + list(cls.__fromLdmlPath(insert)) + tail
diff --git a/util/locale_database/localetools.py b/util/locale_database/localetools.py
new file mode 100644
index 0000000000..29153366b3
--- /dev/null
+++ b/util/locale_database/localetools.py
@@ -0,0 +1,164 @@
+#############################################################################
+##
+## Copyright (C) 2020 The Qt Company Ltd.
+## Contact: https://www.qt.io/licensing/
+##
+## This file is part of the test suite of the Qt Toolkit.
+##
+## $QT_BEGIN_LICENSE:GPL-EXCEPT$
+## Commercial License Usage
+## Licensees holding valid commercial Qt licenses may use this file in
+## accordance with the commercial license agreement provided with the
+## Software or, alternatively, in accordance with the terms contained in
+## a written agreement between you and The Qt Company. For licensing terms
+## and conditions see https://www.qt.io/terms-conditions. For further
+## information use the contact form at https://www.qt.io/contact-us.
+##
+## GNU General Public License Usage
+## Alternatively, this file may be used under the terms of the GNU
+## General Public License version 3 as published by the Free Software
+## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+## included in the packaging of this file. Please review the following
+## information to ensure the GNU General Public License requirements will
+## be met: https://www.gnu.org/licenses/gpl-3.0.html.
+##
+## $QT_END_LICENSE$
+##
+#############################################################################
+"""Utilities shared among the CLDR extraction tools.
+
+Functions:
+ unicode2hex() -- converts unicode text to UCS-2 in hex form.
+ wrap_list() -- map list to comma-separated string, 20 entries per line.
+
+Classes:
+ Error -- A shared error class.
+ Transcriber -- edit a file by writing a temporary file, then renaming.
+ SourceFileEditor -- adds standard prelude and tail handling to Transcriber.
+"""
+
+import os
+import tempfile
+
+class Error (StandardError):
+ __upinit = StandardError.__init__
+ def __init__(self, msg, *args):
+ self.__upinit(msg, *args)
+ self.message = msg
+ def __str__(self):
+ return self.message
+
+def unicode2hex(s):
+ lst = []
+ for x in s:
+ v = ord(x)
+ if v > 0xFFFF:
+ # make a surrogate pair
+ # copied from qchar.h
+ high = (v >> 10) + 0xd7c0
+ low = (v % 0x400 + 0xdc00)
+ lst.append(hex(high))
+ lst.append(hex(low))
+ else:
+ lst.append(hex(v))
+ return lst
+
+def wrap_list(lst):
+ def split(lst, size):
+ while lst:
+ head, lst = lst[:size], lst[size:]
+ yield head
+ return ",\n".join(", ".join(x) for x in split(lst, 20))
+
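+# Illustrative sketch of how these two helpers combine when emitting tables:
+#     unicode2hex(u'A\xe9')             # -> ['0x41', '0xe9']
+#     wrap_list(unicode2hex(u'A\xe9'))  # -> '0x41, 0xe9' (20 entries per line)
+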
+class Transcriber (object):
+ """Helper class to facilitate rewriting source files.
+
+ This class takes care of the temporary file manipulation. Derived
+ classes need to implement transcribing of the content, with
+ whatever modifications they may want. Members reader and writer
+ are exposed; use writer.write() to output to the new file; use
+ reader.readline() or iterate reader to read the original.
+
+ Callers should call close() on success or cleanup() on failure (to
+ clear away the temporary file).
+ """
+ def __init__(self, path, temp):
+ # Open the old file
+ self.reader = open(path)
+ # Create a temp file to write the new data into
+ temp, tempPath = tempfile.mkstemp(os.path.split(path)[1], dir = temp)
+ self.__names = path, tempPath
+ self.writer = os.fdopen(temp, "w")
+
+ def close(self):
+ self.reader.close()
+ self.writer.close()
+ self.reader = self.writer = None
+ source, temp = self.__names
+ os.remove(source)
+ os.rename(temp, source)
+
+ def cleanup(self):
+ if self.__names:
+ self.reader.close()
+ self.writer.close()
+ # Remove temp-file:
+ os.remove(self.__names[1])
+ self.__names = ()
+
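+# Minimal usage sketch (the file name and temp directory are hypothetical),
+# following the close()-on-success / cleanup()-on-failure contract above:
+#     scribe = Transcriber('some_data_p.h', '/tmp')
+#     try:
+#         for line in scribe.reader:
+#             scribe.writer.write(line.replace('old', 'new'))
+#     except Exception:
+#         scribe.cleanup()
+#         raise
+#     else:
+#         scribe.close()
+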
+class SourceFileEditor (Transcriber):
+    """Transcriber with transcription of code around a generated block.
+
+ We have a common pattern of source files with a generated part
+ embedded in a context that's not touched by the regeneration
+ scripts. The generated part is, in each case, marked with a common
+ pair of start and end markers. We transcribe the old file to a new
+ temporary file; on success, we then remove the original and move
+ the new version to replace it.
+
+ This class takes care of transcribing the parts before and after
+ the generated content; on creation, an instance will copy the
+ preamble up to the start marker; its close() will skip over the
+ original's generated content and resume transcribing with the end
+ marker. Derived classes need only implement the generation of the
+ content in between.
+
+ Callers should call close() on success or cleanup() on failure (to
+ clear away the temporary file); see Transcriber.
+ """
+ __upinit = Transcriber.__init__
+ def __init__(self, path, temp):
+ """Set up the source file editor.
+
+ Requires two arguments: the path to the source file to be read
+ and, on success, replaced with a new version; and the
+ directory in which to store the temporary file during the
+ rewrite."""
+ self.__upinit(path, temp)
+ self.__copyPrelude()
+
+ __upclose = Transcriber.close
+ def close(self):
+ self.__copyTail()
+ self.__upclose()
+
+ # Implementation details:
+ GENERATED_BLOCK_START = '// GENERATED PART STARTS HERE'
+ GENERATED_BLOCK_END = '// GENERATED PART ENDS HERE'
+
+ def __copyPrelude(self):
+ # Copy over the first non-generated section to the new file
+ for line in self.reader:
+ self.writer.write(line)
+ if line.strip() == self.GENERATED_BLOCK_START:
+ break
+
+ def __copyTail(self):
+ # Skip through the old generated data in the old file
+ for line in self.reader:
+ if line.strip() == self.GENERATED_BLOCK_END:
+ self.writer.write(line)
+ break
+ # Transcribe the remainder:
+ for line in self.reader:
+ self.writer.write(line)
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index 0a4628e05e..550021ba01 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -28,11 +28,18 @@
#############################################################################
"""Shared serialization-scanning code for QLocaleXML format.
-The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py
+Provides classes:
+ Locale -- common data-type representing one locale as a namespace
+ QLocaleXmlWriter -- helper to write a QLocaleXML file
+ QLocaleXmlReader -- helper to read a QLocaleXML file back in
+
+Support:
+ Spacer -- provides control over indentation of the output.
"""
+from __future__ import print_function
from xml.sax.saxutils import escape
-import xpathlite
+from localetools import Error
# Tools used by Locale:
def camel(seq):
@@ -43,6 +50,10 @@ def camel(seq):
def camelCase(words):
return ''.join(camel(iter(words)))
+def addEscapes(s):
+ return ''.join(c if n < 128 else '\\x{:02x}'.format(n)
+ for n, c in ((ord(c), c) for c in s))
+
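+# For example, addEscapes(u'kr\xf3na') replaces the non-ASCII '\xf3' with the
+# four characters of its backslash-x escape, leaving ASCII untouched.
+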
def startCount(c, text): # strspn
"""First index in text where it doesn't have a character in c"""
assert text and text[0] in c
@@ -58,6 +69,8 @@ def convertFormat(format):
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
"""
+ # Compare and contrast dateconverter.py's convert_date().
+ # Need to (check consistency and) reduce redundancy !
result = ""
i = 0
while i < len(format):
@@ -102,7 +115,314 @@ def convertFormat(format):
return result
-class Locale:
+class QLocaleXmlReader (object):
+ def __init__(self, filename):
+ self.root = self.__parse(filename)
+ # Lists of (id, name, code) triples:
+ languages = tuple(self.__loadMap('language'))
+ scripts = tuple(self.__loadMap('script'))
+ countries = tuple(self.__loadMap('country'))
+ self.__likely = tuple(self.__likelySubtagsMap())
+ # Mappings {ID: (name, code)}
+ self.languages = dict((v[0], v[1:]) for v in languages)
+ self.scripts = dict((v[0], v[1:]) for v in scripts)
+ self.countries = dict((v[0], v[1:]) for v in countries)
+ # Private mappings {name: (ID, code)}
+ self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
+ self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
+ self.__landByName = dict((v[1], (v[0], v[2])) for v in countries)
+ # Other properties:
+ self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries)
+ self.cldrVersion = self.__firstChildText(self.root, "version")
+
+ def loadLocaleMap(self, calendars, grumble = lambda text: None):
+ kid = self.__firstChildText
+ likely = dict(self.__likely)
+ for elt in self.__eachEltInGroup(self.root, 'localeList', 'locale'):
+ locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars)
+ language = self.__langByName[locale.language][0]
+ script = self.__textByName[locale.script][0]
+ country = self.__landByName[locale.country][0]
+
+ if language != 1: # C
+ if country == 0:
+ grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language))
+
+ if script == 0:
+ # Find default script for the given language and country - see:
+ # http://www.unicode.org/reports/tr35/#Likely_Subtags
+ try:
+ try:
+ to = likely[(locale.language, 'AnyScript', locale.country)]
+ except KeyError:
+ to = likely[(locale.language, 'AnyScript', 'AnyCountry')]
+ except KeyError:
+ pass
+ else:
+ locale.script = to[1]
+ script = self.__textByName[locale.script][0]
+
+ yield (language, script, country), locale
+
+ def languageIndices(self, locales):
+ index = 0
+ for key, value in self.languages.iteritems():
+ i, count = 0, locales.count(key)
+ if count > 0:
+ i = index
+ index += count
+ yield i, value[0]
+
+ def likelyMap(self):
+ def tag(t):
+ lang, script, land = t
+ yield lang[1] if lang[0] else 'und'
+ if script[0]: yield script[1]
+ if land[0]: yield land[1]
+
+ def ids(t):
+ return tuple(x[0] for x in t)
+
+ for i, pair in enumerate(self.__likely, 1):
+ have = self.__fromNames(pair[0])
+ give = self.__fromNames(pair[1])
+ yield ('_'.join(tag(have)), ids(have),
+ '_'.join(tag(give)), ids(give),
+ i == len(self.__likely))
+
+ def defaultMap(self):
+ """Map language and script to their default country by ID.
+
+ Yields ((language, script), country) wherever the likely
+ sub-tags mapping says language's default locale uses the given
+ script and country."""
+ for have, give in self.__likely:
+ if have[1:] == ('AnyScript', 'AnyCountry') and give[2] != 'AnyCountry':
+ assert have[0] == give[0], (have, give)
+ yield ((self.__langByName[give[0]][0],
+ self.__textByName[give[1]][0]),
+ self.__landByName[give[2]][0])
+
+ # Implementation details:
+ def __loadMap(self, category):
+ kid = self.__firstChildText
+ for element in self.__eachEltInGroup(self.root, category + 'List', category):
+ yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code')
+
+ def __likelySubtagsMap(self):
+ def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText):
+ return tuple(kid(element, key) for key in keys)
+
+ kid = self.__firstChildElt
+ for elt in self.__eachEltInGroup(self.root, 'likelySubtags', 'likelySubtag'):
+ yield triplet(kid(elt, "from")), triplet(kid(elt, "to"))
+
+ def __fromNames(self, names):
+ return self.__langByName[names[0]], self.__textByName[names[1]], self.__landByName[names[2]]
+
+ # DOM access:
+ from xml.dom import minidom
+ @staticmethod
+ def __parse(filename, read = minidom.parse):
+ return read(filename).documentElement
+
+ @staticmethod
+ def __isNodeNamed(elt, name, TYPE=minidom.Node.ELEMENT_NODE):
+ return elt.nodeType == TYPE and elt.nodeName == name
+ del minidom
+
+ @staticmethod
+ def __eltWords(elt):
+ child = elt.firstChild
+ while child:
+ if child.nodeType == elt.TEXT_NODE:
+ yield child.nodeValue
+ child = child.nextSibling
+
+ @classmethod
+ def __firstChildElt(cls, parent, name):
+ child = parent.firstChild
+ while child:
+ if cls.__isNodeNamed(child, name):
+ return child
+ child = child.nextSibling
+
+ raise Error('No {} child found'.format(name))
+
+ @classmethod
+ def __firstChildText(cls, elt, key):
+ return ' '.join(cls.__eltWords(cls.__firstChildElt(elt, key)))
+
+ @classmethod
+ def __eachEltInGroup(cls, parent, group, key):
+ try:
+ element = cls.__firstChildElt(parent, group).firstChild
+ except Error:
+ element = None
+
+ while element:
+ if cls.__isNodeNamed(element, key):
+ yield element
+ element = element.nextSibling
+
+
+class Spacer (object):
+ def __init__(self, indent = None, initial = ''):
+ """Prepare to manage indentation and line breaks.
+
+ Arguments are both optional.
+
+ First argument, indent, is either None (its default, for
+        'minifying'), an integer (number of spaces) or the unit of
+ text that is to be used for each indentation level (e.g. '\t'
+ to use tabs). If indent is None, no indentation is added, nor
+ are line-breaks; otherwise, self(text), for non-empty text,
+ shall end with a newline and begin with indentation.
+
+ Second argument, initial, is the initial indentation; it is
+ ignored if indent is None. Indentation increases after each
+ call to self(text) in which text starts with a tag and doesn't
+ include its end-tag; indentation decreases if text starts with
+ an end-tag. The text is not parsed any more carefully than
+ just described.
+ """
+ if indent is None:
+ self.__call = lambda x: x
+ else:
+ self.__each = ' ' * indent if isinstance(indent, int) else indent
+ self.current = initial
+ self.__call = self.__wrap
+
+ def __wrap(self, line):
+ if not line:
+ return '\n'
+
+ indent = self.current
+ if line.startswith('</'):
+ indent = self.current = indent[:-len(self.__each)]
+ elif line.startswith('<') and not line.startswith('<!'):
+ cut = line.find('>')
+ tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0]
+ if '</{}>'.format(tag) not in line:
+ self.current += self.__each
+ return indent + line + '\n'
+
+ def __call__(self, line):
+ return self.__call(line)
+
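+# Illustrative behaviour (with indent = 4; the version number is made up):
+#     space = Spacer(4)
+#     space('<localeDatabase>')       # -> '<localeDatabase>\n'; indent grows
+#     space('<version>37</version>')  # -> '    <version>37</version>\n'
+#     space('</localeDatabase>')      # -> '</localeDatabase>\n'; indent shrinks
+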
+class QLocaleXmlWriter (object):
+ def __init__(self, save = None, space = Spacer(4)):
+ """Set up to write digested CLDR data as QLocale XML.
+
+ Arguments are both optional.
+
+ First argument, save, is None (its default) or a callable that
+ will write content to where you intend to save it. If None, it
+ is replaced with a callable that prints the given content,
+ suppressing the newline (but see the following); this is
+ equivalent to passing sys.stdout.write.
+
+ Second argument, space, is an object to call on each text
+ output to prepend indentation and append newlines, or not as
+ the case may be. The default is a Spacer(4), which grows
+ indent by four spaces after each unmatched new tag and shrinks
+ back on a close-tag (its parsing is naive, but adequate to how
+ this class uses it), while adding a newline to each line.
+ """
+ self.__rawOutput = self.__printit if save is None else save
+ self.__wrap = space
+ self.__write('<localeDatabase>')
+
+ # Output of various sections, in their usual order:
+ def enumData(self, languages, scripts, countries):
+ self.__enumTable('languageList', languages)
+ self.__enumTable('scriptList', scripts)
+ self.__enumTable('countryList', countries)
+
+ def likelySubTags(self, entries):
+ self.__openTag('likelySubtags')
+ for have, give in entries:
+ self.__openTag('likelySubtag')
+ self.__likelySubTag('from', have)
+ self.__likelySubTag('to', give)
+ self.__closeTag('likelySubtag')
+ self.__closeTag('likelySubtags')
+
+ def locales(self, locales, calendars):
+ self.__openTag('localeList')
+ self.__openTag('locale')
+ Locale.C(calendars).toXml(self.inTag, calendars)
+ self.__closeTag('locale')
+ keys = locales.keys()
+ keys.sort()
+ for key in keys:
+ self.__openTag('locale')
+ locales[key].toXml(self.inTag, calendars)
+ self.__closeTag('locale')
+ self.__closeTag('localeList')
+
+ def version(self, cldrVersion):
+ self.inTag('version', cldrVersion)
+
+ def inTag(self, tag, text):
+ self.__write('<{0}>{1}</{0}>'.format(tag, text))
+
+ def close(self):
+ if self.__rawOutput != self.__complain:
+ self.__write('</localeDatabase>')
+ self.__rawOutput = self.__complain
+
+ # Implementation details
+ @staticmethod
+ def __printit(text):
+ print(text, end='')
+ @staticmethod
+ def __complain(text):
+ raise Error('Attempted to write data after closing :-(')
+
+ def __enumTable(self, tag, table):
+ self.__openTag(tag)
+ for key, value in table.iteritems():
+ self.__openTag(tag[:-4])
+ self.inTag('name', value[0])
+ self.inTag('id', key)
+ self.inTag('code', value[1])
+ self.__closeTag(tag[:-4])
+ self.__closeTag(tag)
+
+ def __likelySubTag(self, tag, likely):
+ self.__openTag(tag)
+ self.inTag('language', likely[0])
+ self.inTag('script', likely[1])
+ self.inTag('country', likely[2])
+ # self.inTag('variant', likely[3])
+ self.__closeTag(tag)
+
+ def __openTag(self, tag):
+ self.__write('<{}>'.format(tag))
+ def __closeTag(self, tag):
+ self.__write('</{}>'.format(tag))
+
+ def __write(self, line):
+ self.__rawOutput(self.__wrap(line))
+
+class Locale (object):
+ """Holder for the assorted data representing one locale.
+
+ Implemented as a namespace; its constructor and update() have the
+ same signatures as those of a dict, acting on the instance's
+ __dict__, so the results are accessed as attributes rather than
+ mapping keys."""
+ def __init__(self, data=None, **kw):
+ self.update(data, **kw)
+
+ def update(self, data=None, **kw):
+ if data: self.__dict__.update(data)
+ if kw: self.__dict__.update(kw)
+
+ def __len__(self): # Used when testing as a boolean
+ return len(self.__dict__)
+
@staticmethod
def propsMonthDay(scale, lengths=('long', 'short', 'narrow')):
for L in lengths:
@@ -158,16 +478,24 @@ class Locale:
return cls(data)
- def toXml(self, calendars=('gregorian',), indent=' ', tab=' '):
- print indent + '<locale>'
- inner = indent + tab
+ def toXml(self, write, calendars=('gregorian',)):
+ """Writes its data as QLocale XML.
+
+ First argument, write, is a callable taking the name and
+ content of an XML element; it is expected to be the inTag
+ bound method of a QLocaleXmlWriter instance.
+
+ Optional second argument is a list of calendar names, in the
+ form used by CLDR; its default is ('gregorian',).
+ """
get = lambda k: getattr(self, k)
for key in ('language', 'script', 'country'):
- print inner + "<%s>" % key + get(key) + "</%s>" % key
- print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key
+ write(key, get(key))
+ write('{}code'.format(key), get('{}_code'.format(key)))
- for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'):
- print inner + "<%s>" % key + get(key) + "</%s>" % key
+ for key in ('decimal', 'group', 'zero', 'list',
+ 'percent', 'minus', 'plus', 'exp'):
+ write(key, get(key))
for key in ('languageEndonym', 'countryEndonym',
'quotationStart', 'quotationEnd',
@@ -185,16 +513,10 @@ class Locale:
'_'.join((k, cal))
for k in self.propsMonthDay('months')
for cal in calendars):
- print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key)
+ write(key, escape(get(key)).encode('utf-8'))
for key in ('currencyDigits', 'currencyRounding'):
- print inner + "<%s>%d</%s>" % (key, get(key), key)
-
- print indent + "</locale>"
-
- def __init__(self, data=None, **kw):
- if data: self.__dict__.update(data)
- if kw: self.__dict__.update(kw)
+ write(key, get(key))
# Tools used by __monthNames:
def fullName(i, name): return name
@@ -213,6 +535,9 @@ class Locale:
@staticmethod
def __monthNames(calendars,
known={ # Map calendar to (names, extractors...):
+ # TODO: do we even need these ? CLDR's root.xml seems to
+ # have them, complete with yeartype="leap" handling for
+ # Hebrew's extra.
'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July',
'August', 'September', 'October', 'November', 'December'),
# Extractor pairs, (plain, standalone)
@@ -240,8 +565,8 @@ class Locale:
for cal in calendars:
try:
data = known[cal]
- except KeyError: # Need to add an entry to known, above.
- print 'Unsupported calendar:', cal
+ except KeyError as e: # Need to add an entry to known, above.
+ e.args += ('Unsupported calendar:', cal)
raise
names, get = data[0], data[1:]
for n, size in enumerate(sizes):
@@ -253,12 +578,11 @@ class Locale:
@classmethod
def C(cls, calendars=('gregorian',),
- # Empty entry at end to ensure final separator when join()ed:
days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
'Thursday', 'Friday', 'Saturday'),
quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
"""Returns an object representing the C locale."""
- return cls(dict(cls.__monthNames(calendars)),
+ return cls(cls.__monthNames(calendars),
language='C', language_code='0', languageEndonym='',
script='AnyScript', script_code='0',
country='AnyCountry', country_code='0', countryEndonym='',
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index 3dde298f47..db45ab2778 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -34,238 +34,53 @@ the root of the qtbase check-out as second parameter.
"""
import os
-import sys
-import tempfile
import datetime
-import xml.dom.minidom
-from enumdata import language_aliases, country_aliases, script_aliases
-from qlocalexml import Locale
+from qlocalexml import QLocaleXmlReader
+from xml.dom import minidom
+from localetools import unicode2hex, wrap_list, Error, Transcriber, SourceFileEditor
-# TODO: Make calendars a command-line parameter
-# map { CLDR name: Qt file name }
-calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew',
-
-generated_template = """
-/*
- This part of the file was generated on %s from the
- Common Locale Data Repository v%s
-
- http://www.unicode.org/cldr/
-
- Do not edit this section: instead regenerate it using
- cldr2qlocalexml.py and qlocalexml2cpp.py on updated (or
- edited) CLDR data; see qtbase/util/locale_database/.
-*/
-
-"""
-
-class Error:
- def __init__(self, msg):
- self.msg = msg
- def __str__(self):
- return self.msg
-
-def wrap_list(lst):
- def split(lst, size):
- while lst:
- head, lst = lst[:size], lst[size:]
- yield head
- return ",\n".join(", ".join(x) for x in split(lst, 20))
-
-def isNodeNamed(elt, name, TYPE=xml.dom.minidom.Node.ELEMENT_NODE):
- return elt.nodeType == TYPE and elt.nodeName == name
-
-def firstChildElt(parent, name):
- child = parent.firstChild
- while child:
- if isNodeNamed(child, name):
- return child
- child = child.nextSibling
+def compareLocaleKeys(key1, key2):
+ if key1 == key2:
+ return 0
- raise Error('No %s child found' % name)
+ if key1[0] != key2[0]: # First sort by language:
+ return key1[0] - key2[0]
-def eachEltInGroup(parent, group, key):
+ defaults = compareLocaleKeys.default_map
+ # maps {(language, script): country} by ID
try:
- element = firstChildElt(parent, group).firstChild
- except Error:
- element = None
-
- while element:
- if isNodeNamed(element, key):
- yield element
- element = element.nextSibling
-
-def eltWords(elt):
- child = elt.firstChild
- while child:
- if child.nodeType == elt.TEXT_NODE:
- yield child.nodeValue
- child = child.nextSibling
-
-def firstChildText(elt, key):
- return ' '.join(eltWords(firstChildElt(elt, key)))
-
-def loadMap(doc, category):
- return dict((int(firstChildText(element, 'id')),
- (firstChildText(element, 'name'),
- firstChildText(element, 'code')))
- for element in eachEltInGroup(doc.documentElement,
- category + 'List', category))
-
-def loadLikelySubtagsMap(doc):
- def triplet(element, keys=('language', 'script', 'country')):
- return tuple(firstChildText(element, key) for key in keys)
-
- return dict((i, {'from': triplet(firstChildElt(elt, "from")),
- 'to': triplet(firstChildElt(elt, "to"))})
- for i, elt in enumerate(eachEltInGroup(doc.documentElement,
- 'likelySubtags', 'likelySubtag')))
-
-def fixedScriptName(name, dupes):
- # Don't .capitalize() as some names are already camel-case (see enumdata.py):
- name = ''.join(word[0].upper() + word[1:] for word in name.split())
- if name[-6:] != "Script":
- name = name + "Script"
- if name in dupes:
- sys.stderr.write("\n\n\nERROR: The script name '%s' is messy" % name)
- sys.exit(1)
- return name
-
-def fixedCountryName(name, dupes):
- if name in dupes:
- return name.replace(" ", "") + "Country"
- return name.replace(" ", "")
-
-def fixedLanguageName(name, dupes):
- if name in dupes:
- return name.replace(" ", "") + "Language"
- return name.replace(" ", "")
-
-def findDupes(country_map, language_map):
- country_set = set(v[0] for a, v in country_map.iteritems())
- language_set = set(v[0] for a, v in language_map.iteritems())
- return country_set & language_set
-
-def languageNameToId(name, language_map):
- for key in language_map.keys():
- if language_map[key][0] == name:
- return key
- return -1
-
-def scriptNameToId(name, script_map):
- for key in script_map.keys():
- if script_map[key][0] == name:
- return key
- return -1
-
-def countryNameToId(name, country_map):
- for key in country_map.keys():
- if country_map[key][0] == name:
- return key
- return -1
-
-def loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map):
- result = {}
-
- for locale_elt in eachEltInGroup(doc.documentElement, "localeList", "locale"):
- locale = Locale.fromXmlData(lambda k: firstChildText(locale_elt, k), calendars.keys())
- language_id = languageNameToId(locale.language, language_map)
- if language_id == -1:
- sys.stderr.write("Cannot find a language id for '%s'\n" % locale.language)
- script_id = scriptNameToId(locale.script, script_map)
- if script_id == -1:
- sys.stderr.write("Cannot find a script id for '%s'\n" % locale.script)
- country_id = countryNameToId(locale.country, country_map)
- if country_id == -1:
- sys.stderr.write("Cannot find a country id for '%s'\n" % locale.country)
-
- if language_id != 1: # C
- if country_id == 0:
- sys.stderr.write("loadLocaleMap: No country id for '%s'\n" % locale.language)
-
- if script_id == 0:
- # find default script for a given language and country (see http://www.unicode.org/reports/tr35/#Likely_Subtags)
- for key in likely_subtags_map.keys():
- tmp = likely_subtags_map[key]
- if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == locale.country:
- locale.script = tmp["to"][1]
- script_id = scriptNameToId(locale.script, script_map)
- break
- if script_id == 0 and country_id != 0:
- # try with no country
- for key in likely_subtags_map.keys():
- tmp = likely_subtags_map[key]
- if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry":
- locale.script = tmp["to"][1]
- script_id = scriptNameToId(locale.script, script_map)
- break
-
- result[(language_id, script_id, country_id)] = locale
-
- return result
+ country = defaults[key1[:2]]
+ except KeyError:
+ pass
+ else:
+ if key1[2] == country:
+ return -1
+ if key2[2] == country:
+ return 1
-def compareLocaleKeys(key1, key2):
- if key1 == key2:
- return 0
+ if key1[1] == key2[1]:
+ return key1[2] - key2[2]
- if key1[0] == key2[0]:
- l1 = compareLocaleKeys.locale_map[key1]
- l2 = compareLocaleKeys.locale_map[key2]
-
- if (l1.language, l1.script) in compareLocaleKeys.default_map.keys():
- default = compareLocaleKeys.default_map[(l1.language, l1.script)]
- if l1.country == default:
- return -1
- if l2.country == default:
- return 1
-
- if key1[1] != key2[1]:
- if (l2.language, l2.script) in compareLocaleKeys.default_map.keys():
- default = compareLocaleKeys.default_map[(l2.language, l2.script)]
- if l2.country == default:
- return 1
- if l1.country == default:
- return -1
-
- if key1[1] != key2[1]:
- return key1[1] - key2[1]
+ try:
+ country = defaults[key2[:2]]
+ except KeyError:
+ pass
else:
- return key1[0] - key2[0]
+ if key2[2] == country:
+ return 1
+ if key1[2] == country:
+ return -1
+
+ return key1[1] - key2[1]
- return key1[2] - key2[2]
-
-
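Only part of the new compareLocaleKeys() is visible in this hunk; assuming two keys that already share a language ID, the defaults lookup above sorts the (language, script) pair's default country ahead of its siblings. A sketch with placeholder IDs:

compareLocaleKeys.default_map = {(2, 1): 3}   # hypothetical: country 3 is the default
keys = [(2, 1, 5), (2, 1, 3), (2, 1, 4)]
keys.sort(compareLocaleKeys)                  # cmp-style sort, as in main() below
assert keys == [(2, 1, 3), (2, 1, 4), (2, 1, 5)]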
-def languageCount(language_id, locale_map):
- result = 0
- for key in locale_map.keys():
- if key[0] == language_id:
- result += 1
- return result
-
-def unicode2hex(s):
- lst = []
- for x in s:
- v = ord(x)
- if v > 0xFFFF:
- # make a surrogate pair
- # copied from qchar.h
- high = (v >> 10) + 0xd7c0
- low = (v % 0x400 + 0xdc00)
- lst.append(hex(high))
- lst.append(hex(low))
- else:
- lst.append(hex(v))
- return lst
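The surrogate-pair arithmetic removed above mirrors UTF-16 encoding (the constants come from qchar.h); a minimal check using U+1F600:

def to_surrogate_pair(v):
    # Same arithmetic as unicode2hex(): split a code point beyond the
    # Basic Multilingual Plane into a UTF-16 surrogate pair.
    return (v >> 10) + 0xd7c0, (v % 0x400) + 0xdc00

assert to_surrogate_pair(0x1F600) == (0xD83D, 0xDE00)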
class StringDataToken:
def __init__(self, index, length, bits):
if index > 0xffff:
- print "\n\n\n#error Data index is too big!", index
- raise ValueError("Start-index (%d) exceeds the uint16 range!" % index)
+ raise ValueError('Start-index ({}) exceeds the uint16 range!'.format(index))
if length >= (1 << bits):
- print "\n\n\n#error Range length is too big!", length
- raise ValueError("Data size (%d) exceeds the %d-bit range!" % (length, bits))
+ raise ValueError('Data size ({}) exceeds the {}-bit range!'.format(length, bits))
self.index = index
self.length = length
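StringDataToken simply validates that a range fits the generated tables: the start index must fit quint16 and the length must fit the given bit width. A small sketch of both outcomes:

StringDataToken(12, 5, 8)         # fits: index <= 0xffff, length < 2**8
try:
    StringDataToken(7, 300, 8)    # 300 >= 2**8
except ValueError as e:
    print(e)                      # Data size (300) exceeds the 8-bit range!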
@@ -277,7 +92,7 @@ class StringData:
self.name = name
self.text = '' # Used in quick-search for matches in data
- def append(self, s, bits=8):
+ def append(self, s, bits = 8):
try:
token = self.hash[s]
except KeyError:
@@ -317,592 +132,481 @@ class StringData:
def write(self, fd):
if len(self.data) > 0xffff:
- raise ValueError("Data is too big for quint16 index to its end!" % len(self.data),
+ raise ValueError('Data is too big ({}) for quint16 index to its end!'
+ .format(len(self.data)),
self.name)
- fd.write("\nstatic const char16_t %s[] = {\n" % self.name)
+ fd.write("\nstatic const char16_t {}[] = {{\n".format(self.name))
fd.write(wrap_list(self.data))
fd.write("\n};\n")
-def escapedString(s):
- result = ""
- i = 0
- while i < len(s):
- if s[i] == '"':
- result += '\\"'
- i += 1
- else:
- result += s[i]
- i += 1
- s = result
-
- line = ""
- need_escape = False
- result = ""
- for c in s:
- if ord(c) < 128 and not (need_escape and ord('a') <= ord(c.lower()) <= ord('f')):
- line += c
- need_escape = False
- else:
- line += "\\x%02x" % (ord(c))
- need_escape = True
- if len(line) > 80:
- result = result + "\n" + '"' + line + '"'
- line = ""
- line += "\\0"
- result = result + "\n" + '"' + line + '"'
- if result[0] == "\n":
- result = result[1:]
- return result
-
-def printEscapedString(s):
- print escapedString(s)
-
def currencyIsoCodeData(s):
if s:
return '{' + ",".join(str(ord(x)) for x in s) + '}'
return "{0,0,0}"
-def usage():
- print "Usage: qlocalexml2cpp.py <path-to-locale.xml> <path-to-qtbase-src-tree>"
- sys.exit(1)
+class LocaleSourceEditor (SourceFileEditor):
+ __upinit = SourceFileEditor.__init__
+ def __init__(self, path, temp, version):
+ self.__upinit(path, temp)
+ self.writer.write("""
+/*
+ This part of the file was generated on {} from the
+ Common Locale Data Repository v{}
+
+ http://www.unicode.org/cldr/
+
+ Do not edit this section: instead regenerate it using
+ cldr2qlocalexml.py and qlocalexml2cpp.py on updated (or
+ edited) CLDR data; see qtbase/util/locale_database/.
+*/
-GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n"
-GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n"
+""".format(datetime.date.today(), version))
+
+class LocaleDataWriter (LocaleSourceEditor):
+ def likelySubtags(self, likely):
+ self.writer.write('static const QLocaleId likely_subtags[] = {\n')
+ for had, have, got, give, last in likely:
+ self.writer.write(' {{ {:3d}, {:3d}, {:3d} }}'.format(*have))
+ self.writer.write(', {{ {:3d}, {:3d}, {:3d} }}'.format(*give))
+ self.writer.write(' ' if last else ',')
+ self.writer.write(' // {} -> {}\n'.format(had, got))
+ self.writer.write('};\n\n')
+
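Each entry that likelySubtags() consumes pairs textual locale names with numeric (language, script, country) ID triples for both sides of the mapping, plus a flag marking the last entry. A hypothetical call (writer being a LocaleDataWriter; the IDs are placeholders, not real Qt enum values):

# (had, have, got, give, last): the names only feed the comments,
# the triples become the table rows.
writer.likelySubtags([('und_Latn', (0, 1, 0), 'en_Latn_US', (2, 1, 3), False),
                      ('und_AQ', (0, 0, 4), 'und_Latn_AQ', (0, 1, 4), True)])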
+ def localeIndex(self, indices):
+ self.writer.write('static const quint16 locale_index[] = {\n')
+ for pair in indices:
+ self.writer.write('{:6d}, // {}\n'.format(*pair))
+ self.writer.write(' 0 // trailing 0\n')
+ self.writer.write('};\n\n')
+
+ def localeData(self, locales, names):
+ list_pattern_part_data = StringData('list_pattern_part_data')
+ single_character_data = StringData('single_character_data')
+ date_format_data = StringData('date_format_data')
+ time_format_data = StringData('time_format_data')
+ days_data = StringData('days_data')
+ am_data = StringData('am_data')
+ pm_data = StringData('pm_data')
+ byte_unit_data = StringData('byte_unit_data')
+ currency_symbol_data = StringData('currency_symbol_data')
+ currency_display_name_data = StringData('currency_display_name_data')
+ currency_format_data = StringData('currency_format_data')
+ endonyms_data = StringData('endonyms_data')
+
+ # Locale data
+ self.writer.write('static const QLocaleData locale_data[] = {\n')
+ # Table headings: keep each label centred in its field, matching formatLine below:
+ self.writer.write(' // '
+ # Width 6 + comma
+ ' lang ' # IDs
+ 'script '
+ ' terr '
+
+ # Range entries (all start-indices, then all sizes)
+ # Width 5 + comma
+ 'lStrt ' # List pattern
+ 'lpMid '
+ 'lpEnd '
+ 'lPair '
+ 'lDelm ' # List delimiter
+ # Representing numbers
+ ' dec '
+ 'group '
+ 'prcnt '
+ ' zero '
+ 'minus '
+ 'plus '
+ ' exp '
+ # Quotation marks
+ 'qtOpn '
+ 'qtEnd '
+ 'altQO '
+ 'altQE '
+ 'lDFmt ' # Date format
+ 'sDFmt '
+ 'lTFmt ' # Time format
+ 'sTFmt '
+ 'slDay ' # Day names
+ 'lDays '
+ 'ssDys '
+ 'sDays '
+ 'snDay '
+ 'nDays '
+ ' am ' # am/pm indicators
+ ' pm '
+ ' byte '
+ 'siQnt '
+ 'iecQn '
+ 'crSym ' # Currency formatting
+ 'crDsp '
+ 'crFmt '
+ 'crFNg '
+ 'ntLng ' # Name of language in itself, and of territory
+ 'ntTer '
+ # Width 3 + comma for each size; no header
+ + ' ' * 37 +
+
+ # Strays (char array, bit-fields):
+ # Width 10 + 2 spaces + comma
+ ' currISO '
+ # Width 6 + comma
+ 'curDgt ' # Currency digits
+ 'curRnd ' # Currency rounding (unused: QTBUG-81343)
+ 'dow1st ' # First day of week
+ ' wknd+ ' # Week-end start/end days
+ ' wknd-'
+ # No trailing space on last entry (be sure to
+ # pad before adding anything after it).
+ '\n')
+
+ formatLine = ''.join((
+ ' {{ ',
+ # Locale-identifier
+ '{:6d},' * 3,
+ # List patterns, date/time formats, day names, am/pm
+ # SI/IEC byte-unit abbreviations
+ # Currency and endonyms
+ # Range starts
+ '{:5d},' * 37,
+ # Range sizes
+ '{:3d},' * 37,
+
+ # Currency ISO code
+ ' {:>10s}, ',
+ # Currency formatting
+ '{:6d},{:6d}',
+ # Day of week and week-end
+ ',{:6d}' * 3,
+ ' }}')).format
+ for key in names:
+ locale = locales[key]
+ # Sequence of StringDataToken:
+ ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries:
+ (locale.listPatternPartStart, locale.listPatternPartMiddle,
+ locale.listPatternPartEnd, locale.listPatternPartTwo,
+ locale.listDelim)) +
+ tuple(single_character_data.append(p) for p in # 11 entries
+ (locale.decimal, locale.group, locale.percent, locale.zero,
+ locale.minus, locale.plus, locale.exp,
+ locale.quotationStart, locale.quotationEnd,
+ locale.alternateQuotationStart, locale.alternateQuotationEnd)) +
+ tuple(date_format_data.append(f) for f in # 2 entries:
+ (locale.longDateFormat, locale.shortDateFormat)) +
+ tuple(time_format_data.append(f) for f in # 2 entries:
+ (locale.longTimeFormat, locale.shortTimeFormat)) +
+ tuple(days_data.append(d) for d in # 6 entries:
+ (locale.standaloneLongDays, locale.longDays,
+ locale.standaloneShortDays, locale.shortDays,
+ locale.standaloneNarrowDays, locale.narrowDays)) +
+ (am_data.append(locale.am), pm_data.append(locale.pm)) + # 2 entries
+ tuple(byte_unit_data.append(b) for b in # 3 entries:
+ (locale.byte_unit,
+ locale.byte_si_quantified,
+ locale.byte_iec_quantified)) +
+ (currency_symbol_data.append(locale.currencySymbol),
+ currency_display_name_data.append(locale.currencyDisplayName),
+ currency_format_data.append(locale.currencyFormat),
+ currency_format_data.append(locale.currencyNegativeFormat),
+ endonyms_data.append(locale.languageEndonym),
+ endonyms_data.append(locale.countryEndonym)) # 6 entries
+ ) # Total: 37 entries
+ assert len(ranges) == 37
+
+ self.writer.write(formatLine(*(
+ key +
+ tuple(r.index for r in ranges) +
+ tuple(r.length for r in ranges) +
+ (currencyIsoCodeData(locale.currencyIsoCode),
+ locale.currencyDigits,
+ locale.currencyRounding, # unused (QTBUG-81343)
+ locale.firstDayOfWeek,
+ locale.weekendStart,
+ locale.weekendEnd) ))
+ + ', // {}/{}/{}\n'.format(
+ locale.language, locale.script, locale.country))
+ self.writer.write(formatLine(*( # All zeros, matching the format:
+ (0,) * 3 + (0,) * 37 * 2
+ + (currencyIsoCodeData(0),)
+ + (0,) * 2
+ + (0,) * 3 ))
+ + ' // trailing zeros\n')
+ self.writer.write('};\n')
+
+ # StringData tables:
+ for data in (list_pattern_part_data, single_character_data,
+ date_format_data, time_format_data, days_data,
+ byte_unit_data, am_data, pm_data, currency_symbol_data,
+ currency_display_name_data, currency_format_data,
+ endonyms_data):
+ data.write(self.writer)
+
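As a cross-check on the row layout written above, formatLine consumes three ID fields, 37 range start-indices, 37 range sizes, the currency ISO code, two currency integers and three week-related integers per locale:

# 3 IDs + 37 starts + 37 sizes + 1 ISO code + 2 currency + 3 week fields
assert 3 + 37 + 37 + 1 + 2 + 3 == 83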
+ @staticmethod
+ def __writeNameData(out, book, form):
+ out('static const char {}_name_list[] =\n'.format(form))
+ out('"Default\\0"\n')
+ for key, value in book.items():
+ if key == 0:
+ continue
+ out('"' + value[0] + '\\0"\n')
+ out(';\n\n')
+
+ out('static const quint16 {}_name_index[] = {{\n'.format(form))
+ out(' 0, // Any{}\n'.format(form.capitalize()))
+ index = 8
+ for key, value in book.items():
+ if key == 0:
+ continue
+ name = value[0]
+ out('{:6d}, // {}\n'.format(index, name))
+ index += len(name) + 1
+ out('};\n\n')
+
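The start offset of 8 in __writeNameData() is just len('Default\0'); each later offset advances by len(name) + 1 for the terminating NUL. A toy book (hypothetical entries) reproduces the index computation:

book = {0: ('Default', ''), 1: ('Abkhazian', 'ab'), 2: ('Afar', 'aa')}
index, offsets = 8, {}                # len('Default\0') == 8
for key, (name, code) in sorted(book.items()):
    if key:
        offsets[key] = index
        index += len(name) + 1        # one byte for the trailing '\0'
assert offsets == {1: 8, 2: 18}       # 'Abkhazian\0' occupies 10 bytes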
+ @staticmethod
+ def __writeCodeList(out, book, form, width):
+ out('static const unsigned char {}_code_list[] =\n'.format(form))
+ for key, value in book.items():
+ code = value[1]
+ code += r'\0' * max(width - len(code), 0)
+ out('"{}" // {}\n'.format(code, value[0]))
+ out(';\n\n')
+
+ def languageNames(self, languages):
+ self.__writeNameData(self.writer.write, languages, 'language')
+
+ def scriptNames(self, scripts):
+ self.__writeNameData(self.writer.write, scripts, 'script')
+
+ def countryNames(self, countries):
+ self.__writeNameData(self.writer.write, countries, 'country')
+
+ # TODO: unify these next three into the previous three; kept
+ # separate for now to verify we're not changing data.
+
+ def languageCodes(self, languages):
+ self.__writeCodeList(self.writer.write, languages, 'language', 3)
+
+ def scriptCodes(self, scripts):
+ self.__writeCodeList(self.writer.write, scripts, 'script', 4)
+
+ def countryCodes(self, countries): # TODO: unify with countryNames()
+ self.__writeCodeList(self.writer.write, countries, 'country', 3)
+
+class CalendarDataWriter (LocaleSourceEditor):
+ formatCalendar = (
+ ' {{'
+ + ','.join(('{:6d}',) * 3 + ('{:5d}',) * 6 + ('{:3d}',) * 6)
+ + ' }},').format
+ def write(self, calendar, locales, names):
+ months_data = StringData('months_data')
-def main():
- if len(sys.argv) != 3:
- usage()
+ self.writer.write('static const QCalendarLocale locale_data[] = {\n')
+ self.writer.write(
+ ' //'
+ # IDs, width 7 (6 + comma)
+ ' lang '
+ ' script'
+ ' terr '
+ # Month-name start-indices, width 6 (5 + comma)
+ 'sLong '
+ ' long '
+ 'sShrt '
+ 'short '
+ 'sNarw '
+ 'narow '
+ # No individual headers for the sizes.
+ 'Sizes...'
+ '\n')
+ for key in names:
+ locale = locales[key]
+ # Sequence of StringDataToken:
+ try:
+ # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
+ ranges = (tuple(months_data.append(m[calendar], 16) for m in
+ (locale.standaloneLongMonths, locale.longMonths)) +
+ tuple(months_data.append(m[calendar]) for m in
+ (locale.standaloneShortMonths, locale.shortMonths,
+ locale.standaloneNarrowMonths, locale.narrowMonths)))
+ except ValueError as e:
+ e.args += (locale.language, locale.script, locale.country, calendar)
+ raise
- qlocalexml = sys.argv[1]
- qtsrcdir = sys.argv[2]
+ self.writer.write(
+ self.formatCalendar(*(
+ key +
+ tuple(r.index for r in ranges) +
+ tuple(r.length for r in ranges) ))
+ + '// {}/{}/{}\n'.format(locale.language, locale.script, locale.country))
+ self.writer.write(self.formatCalendar(*( (0,) * (3 + 6 * 2) ))
+ + '// trailing zeros\n')
+ self.writer.write('};\n')
+ months_data.write(self.writer)
+
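Similarly to the locale table, each QCalendarLocale row packs three ID fields with six month-name start-indices and six sizes, which is why the trailing-zeros row passes (0,) * (3 + 6 * 2):

# 3 IDs + 6 start-indices + 6 sizes == 15 fields per row of formatCalendar
assert 3 + 6 * 2 == 15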
+class LocaleHeaderWriter (SourceFileEditor):
+ __upinit = SourceFileEditor.__init__
+ def __init__(self, path, temp, dupes):
+ self.__upinit(path, temp)
+ self.__dupes = dupes
+
+ def languages(self, languages):
+ self.__enum('Language', languages, self.__language)
+ self.writer.write('\n')
+
+ def countries(self, countries):
+ self.__enum('Country', countries, self.__country)
+
+ def scripts(self, scripts):
+ self.__enum('Script', scripts, self.__script)
+ self.writer.write('\n')
+
+ # Implementation details
+ from enumdata import (language_aliases as __language,
+ country_aliases as __country,
+ script_aliases as __script)
+
+ def __enum(self, name, book, alias):
+ assert book
+ out, dupes = self.writer.write, self.__dupes
+ out(' enum {} {{\n'.format(name))
+ for key, value in book.items():
+ member = value[0]
+ if name == 'Script':
+ # Don't .capitalize() as some names are already camel-case (see enumdata.py):
+ member = ''.join(word[0].upper() + word[1:] for word in member.split())
+ if not member.endswith('Script'):
+ member += 'Script'
+ if member in dupes:
+ raise Error('The script name "{}" is messy'.format(member))
+ else:
+ member = ''.join(member.split())
+ member = member + name if member in dupes else member
+ out(' {} = {},\n'.format(member, key))
+
+ out('\n '
+ + ',\n '.join('{} = {}'.format(*pair)
+ for pair in sorted(alias.items()))
+ + ',\n\n Last{} = {}\n }};\n'.format(name, member))
+
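The script branch of __enum() camel-cases multi-word names and appends a 'Script' suffix when it is missing (the dupes check aside); for instance:

name = 'Canadian Aboriginal'
member = ''.join(word[0].upper() + word[1:] for word in name.split())
if not member.endswith('Script'):
    member += 'Script'
assert member == 'CanadianAboriginalScript'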
+def usage(name, err, message = ''):
+ err.write("""Usage: {} path/to/qlocale.xml root/of/qtbase
+""".format(name)) # TODO: elaborate
+ if message:
+ err.write('\n' + message + '\n')
+
+def main(args, out, err):
+ # TODO: Make calendars a command-line parameter
+ # map { CLDR name: Qt file name }
+ calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew',
+
+ name = args.pop(0)
+ if len(args) != 2:
+ usage(name, err, 'I expect two arguments')
+ return 1
+
+ qlocalexml = args.pop(0)
+ qtsrcdir = args.pop(0)
if not (os.path.isdir(qtsrcdir)
and all(os.path.isfile(os.path.join(qtsrcdir, 'src', 'corelib', 'text', leaf))
for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))):
- usage()
-
- (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p.h", dir=qtsrcdir)
- data_temp_file = os.fdopen(data_temp_file, "w")
- qlocaledata_file = open(qtsrcdir + "/src/corelib/text/qlocale_data_p.h", "r")
- s = qlocaledata_file.readline()
- while s and s != GENERATED_BLOCK_START:
- data_temp_file.write(s)
- s = qlocaledata_file.readline()
- data_temp_file.write(GENERATED_BLOCK_START)
-
- doc = xml.dom.minidom.parse(qlocalexml)
- language_map = loadMap(doc, 'language')
- script_map = loadMap(doc, 'script')
- country_map = loadMap(doc, 'country')
- likely_subtags_map = loadLikelySubtagsMap(doc)
- default_map = {}
- for key in likely_subtags_map.keys():
- tmp = likely_subtags_map[key]
- if tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry" and tmp["to"][2] != "AnyCountry":
- default_map[(tmp["to"][0], tmp["to"][1])] = tmp["to"][2]
- locale_map = loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map)
- dupes = findDupes(language_map, country_map)
-
- cldr_version = firstChildText(doc.documentElement, "version")
- data_temp_file.write(generated_template % (datetime.date.today(), cldr_version))
-
- # Likely subtags map
- data_temp_file.write("static const QLocaleId likely_subtags[] = {\n")
- index = 0
- for key in likely_subtags_map.keys():
- tmp = likely_subtags_map[key]
- from_language = languageNameToId(tmp["from"][0], language_map)
- from_script = scriptNameToId(tmp["from"][1], script_map)
- from_country = countryNameToId(tmp["from"][2], country_map)
- to_language = languageNameToId(tmp["to"][0], language_map)
- to_script = scriptNameToId(tmp["to"][1], script_map)
- to_country = countryNameToId(tmp["to"][2], country_map)
-
- cmnt_from = ""
- if from_language != 0:
- cmnt_from = cmnt_from + language_map[from_language][1]
- else:
- cmnt_from = cmnt_from + "und"
- if from_script != 0:
- if cmnt_from:
- cmnt_from = cmnt_from + "_"
- cmnt_from = cmnt_from + script_map[from_script][1]
- if from_country != 0:
- if cmnt_from:
- cmnt_from = cmnt_from + "_"
- cmnt_from = cmnt_from + country_map[from_country][1]
- cmnt_to = ""
- if to_language != 0:
- cmnt_to = cmnt_to + language_map[to_language][1]
- else:
- cmnt_to = cmnt_to + "und"
- if to_script != 0:
- if cmnt_to:
- cmnt_to = cmnt_to + "_"
- cmnt_to = cmnt_to + script_map[to_script][1]
- if to_country != 0:
- if cmnt_to:
- cmnt_to = cmnt_to + "_"
- cmnt_to = cmnt_to + country_map[to_country][1]
-
- data_temp_file.write(" ")
- data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" %
- (from_language, from_script, from_country, to_language, to_script, to_country))
- index += 1
- if index != len(likely_subtags_map):
- data_temp_file.write(",")
- else:
- data_temp_file.write(" ")
- data_temp_file.write(" // %s -> %s\n" % (cmnt_from, cmnt_to))
- data_temp_file.write("};\n")
-
- data_temp_file.write("\n")
-
- # Locale index
- data_temp_file.write("static const quint16 locale_index[] = {\n")
- index = 0
- for key in language_map.keys():
- i = 0
- count = languageCount(key, locale_map)
- if count > 0:
- i = index
- index += count
- data_temp_file.write("%6d, // %s\n" % (i, language_map[key][0]))
- data_temp_file.write(" 0 // trailing 0\n")
- data_temp_file.write("};\n\n")
-
- list_pattern_part_data = StringData('list_pattern_part_data')
- single_character_data = StringData('single_character_data')
- date_format_data = StringData('date_format_data')
- time_format_data = StringData('time_format_data')
- days_data = StringData('days_data')
- am_data = StringData('am_data')
- pm_data = StringData('pm_data')
- byte_unit_data = StringData('byte_unit_data')
- currency_symbol_data = StringData('currency_symbol_data')
- currency_display_name_data = StringData('currency_display_name_data')
- currency_format_data = StringData('currency_format_data')
- endonyms_data = StringData('endonyms_data')
-
- # Locale data
- data_temp_file.write("static const QLocaleData locale_data[] = {\n")
- # Table headings: keep each label centred in its field, matching line_format:
- data_temp_file.write(' // '
- # Width 6 + comma:
- + ' lang ' # IDs
- + 'script '
- + ' terr '
-
- # Range entries (all start-indices, then all sizes):
- # Width 5 + comma:
- + 'lStrt ' # List pattern
- + 'lpMid '
- + 'lpEnd '
- + 'lPair '
- + 'lDelm ' # List delimiter
- # Representing numbers:
- + ' dec '
- + 'group '
- + 'prcnt '
- + ' zero '
- + 'minus '
- + 'plus '
- + ' exp '
- # Quotation marks
- + 'qtOpn '
- + 'qtEnd '
- + 'altQO '
- + 'altQE '
- + 'lDFmt ' # Date format
- + 'sDFmt '
- + 'lTFmt ' # Time format
- + 'sTFmt '
- + 'slDay ' # Day names
- + 'lDays '
- + 'ssDys '
- + 'sDays '
- + 'snDay '
- + 'nDays '
- + ' am ' # am/pm indicators
- + ' pm '
- + ' byte '
- + 'siQnt '
- + 'iecQn '
- + 'crSym ' # Currency formatting:
- + 'crDsp '
- + 'crFmt '
- + 'crFNg '
- + 'ntLng ' # Name of language in itself, and of territory:
- + 'ntTer '
- # Width 3 + comma for each size; no header
- + ' ' * 37
-
- # Strays (char array, bit-fields):
- # Width 8+4 + comma
- + ' currISO '
- # Width 6 + comma:
- + 'curDgt ' # Currency digits
- + 'curRnd ' # Currencty rounding (unused: QTBUG-81343)
- + 'dow1st ' # First day of week
- + ' wknd+ ' # Week-end start/end days:
- + ' wknd-'
- # No trailing space on last entry (be sure to
- # pad before adding anything after it).
- + '\n')
+ usage(name, err, 'Missing expected files under qtbase source root ' + qtsrcdir)
+ return 1
+
+ reader = QLocaleXmlReader(qlocalexml)
+ locale_map = dict(reader.loadLocaleMap(calendars, err.write))
locale_keys = locale_map.keys()
- compareLocaleKeys.default_map = default_map
- compareLocaleKeys.locale_map = locale_map
+ compareLocaleKeys.default_map = dict(reader.defaultMap())
locale_keys.sort(compareLocaleKeys)
- line_format = (' { '
- # Locale-identifier:
- + '%6d,' * 3
- # Offsets for starts of ranges:
- + '%5d,' * 37
- # Sizes for the same:
- + '%3d,' * 37
-
- # Currency ISO code:
- + ' %10s, '
- # Currency formatting:
- + '%6d,%6d'
- # Day of week and week-end:
- + ',%6d' * 3
- + ' }')
- for key in locale_keys:
- l = locale_map[key]
- # Sequence of StringDataToken:
- ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries:
- (l.listPatternPartStart, l.listPatternPartMiddle,
- l.listPatternPartEnd, l.listPatternPartTwo, l.listDelim)) +
- tuple(single_character_data.append(p) for p in # 11 entries
- (l.decimal, l.group, l.percent, l.zero, l.minus, l.plus, l.exp,
- l.quotationStart, l.quotationEnd,
- l.alternateQuotationStart, l.alternateQuotationEnd)) +
- tuple (date_format_data.append(f) for f in # 2 entries:
- (l.longDateFormat, l.shortDateFormat)) +
- tuple(time_format_data.append(f) for f in # 2 entries:
- (l.longTimeFormat, l.shortTimeFormat)) +
- tuple(days_data.append(d) for d in # 6 entries:
- (l.standaloneLongDays, l.longDays,
- l.standaloneShortDays, l.shortDays,
- l.standaloneNarrowDays, l.narrowDays)) +
- (am_data.append(l.am), pm_data.append(l.pm)) + # 2 entries:
- tuple(byte_unit_data.append(b) for b in # 3 entries:
- (l.byte_unit, l.byte_si_quantified, l.byte_iec_quantified)) +
- (currency_symbol_data.append(l.currencySymbol),
- currency_display_name_data.append(l.currencyDisplayName),
- currency_format_data.append(l.currencyFormat),
- currency_format_data.append(l.currencyNegativeFormat),
- endonyms_data.append(l.languageEndonym),
- endonyms_data.append(l.countryEndonym)) # 6 entries
- ) # Total: 37 entries
- assert len(ranges) == 37
-
- data_temp_file.write(line_format
- % ((key[0], key[1], key[2]) +
- tuple(r.index for r in ranges) +
- tuple(r.length for r in ranges) +
- (currencyIsoCodeData(l.currencyIsoCode),
- l.currencyDigits,
- l.currencyRounding, # unused (QTBUG-81343)
- l.firstDayOfWeek,
- l.weekendStart,
- l.weekendEnd))
- + ", // %s/%s/%s\n" % (l.language, l.script, l.country))
- data_temp_file.write(line_format # All zeros, matching the format:
- % ( (0,) * 3 + (0,) * 37 * 2
- + (currencyIsoCodeData(0),)
- + (0,) * 2
- + (0,) * 3)
- + " // trailing zeros\n")
- data_temp_file.write("};\n")
-
- # StringData tables:
- for data in (list_pattern_part_data, single_character_data,
- date_format_data, time_format_data, days_data,
- byte_unit_data, am_data, pm_data, currency_symbol_data,
- currency_display_name_data, currency_format_data,
- endonyms_data):
- data.write(data_temp_file)
-
- data_temp_file.write("\n")
-
- # Language name list
- data_temp_file.write("static const char language_name_list[] =\n")
- data_temp_file.write('"Default\\0"\n')
- for key in language_map.keys():
- if key == 0:
- continue
- data_temp_file.write('"' + language_map[key][0] + '\\0"\n')
- data_temp_file.write(";\n")
-
- data_temp_file.write("\n")
-
- # Language name index
- data_temp_file.write("static const quint16 language_name_index[] = {\n")
- data_temp_file.write(" 0, // AnyLanguage\n")
- index = 8
- for key in language_map.keys():
- if key == 0:
- continue
- language = language_map[key][0]
- data_temp_file.write("%6d, // %s\n" % (index, language))
- index += len(language) + 1
- data_temp_file.write("};\n")
-
- data_temp_file.write("\n")
-
- # Script name list
- data_temp_file.write("static const char script_name_list[] =\n")
- data_temp_file.write('"Default\\0"\n')
- for key in script_map.keys():
- if key == 0:
- continue
- data_temp_file.write('"' + script_map[key][0] + '\\0"\n')
- data_temp_file.write(";\n")
-
- data_temp_file.write("\n")
-
- # Script name index
- data_temp_file.write("static const quint16 script_name_index[] = {\n")
- data_temp_file.write(" 0, // AnyScript\n")
- index = 8
- for key in script_map.keys():
- if key == 0:
- continue
- script = script_map[key][0]
- data_temp_file.write("%6d, // %s\n" % (index, script))
- index += len(script) + 1
- data_temp_file.write("};\n")
-
- data_temp_file.write("\n")
-
- # Country name list
- data_temp_file.write("static const char country_name_list[] =\n")
- data_temp_file.write('"Default\\0"\n')
- for key in country_map.keys():
- if key == 0:
- continue
- data_temp_file.write('"' + country_map[key][0] + '\\0"\n')
- data_temp_file.write(";\n")
-
- data_temp_file.write("\n")
-
- # Country name index
- data_temp_file.write("static const quint16 country_name_index[] = {\n")
- data_temp_file.write(" 0, // AnyCountry\n")
- index = 8
- for key in country_map.keys():
- if key == 0:
- continue
- country = country_map[key][0]
- data_temp_file.write("%6d, // %s\n" % (index, country))
- index += len(country) + 1
- data_temp_file.write("};\n")
-
- data_temp_file.write("\n")
-
- # Language code list
- data_temp_file.write("static const unsigned char language_code_list[] =\n")
- for key in language_map.keys():
- code = language_map[key][1]
- if len(code) == 2:
- code += r"\0"
- data_temp_file.write('"%2s" // %s\n' % (code, language_map[key][0]))
- data_temp_file.write(";\n")
-
- data_temp_file.write("\n")
-
- # Script code list
- data_temp_file.write("static const unsigned char script_code_list[] =\n")
- for key in script_map.keys():
- code = script_map[key][1]
- for i in range(4 - len(code)):
- code += "\\0"
- data_temp_file.write('"%2s" // %s\n' % (code, script_map[key][0]))
- data_temp_file.write(";\n")
-
- # Country code list
- data_temp_file.write("static const unsigned char country_code_list[] =\n")
- for key in country_map.keys():
- code = country_map[key][1]
- if len(code) == 2:
- code += "\\0"
- data_temp_file.write('"%2s" // %s\n' % (code, country_map[key][0]))
- data_temp_file.write(";\n")
-
- data_temp_file.write("\n")
- data_temp_file.write(GENERATED_BLOCK_END)
- s = qlocaledata_file.readline()
- # skip until end of the old block
- while s and s != GENERATED_BLOCK_END:
- s = qlocaledata_file.readline()
-
- s = qlocaledata_file.readline()
- while s:
- data_temp_file.write(s)
- s = qlocaledata_file.readline()
- data_temp_file.close()
- qlocaledata_file.close()
-
- os.remove(qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
- os.rename(data_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
+ try:
+ writer = LocaleDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text',
+ 'qlocale_data_p.h'),
+ qtsrcdir, reader.cldrVersion)
+ except IOError as e:
+ err.write('Failed to open files to transcribe locale data: ' + (e.message or e.args[1]))
+ return 1
+
+ try:
+ writer.likelySubtags(reader.likelyMap())
+ writer.localeIndex(reader.languageIndices(tuple(k[0] for k in locale_map)))
+ writer.localeData(locale_map, locale_keys)
+ writer.writer.write('\n')
+ writer.languageNames(reader.languages)
+ writer.scriptNames(reader.scripts)
+ writer.countryNames(reader.countries)
+ # TODO: merge the next three into the previous three
+ writer.languageCodes(reader.languages)
+ writer.scriptCodes(reader.scripts)
+ writer.countryCodes(reader.countries)
+ except Error as e:
+ writer.cleanup()
+ err.write('\nError updating locale data: ' + e.message + '\n')
+ return 1
+
+ writer.close()
# Generate calendar data
- calendar_format = ' {%6d,%6d,%6d' + ',%5d' * 6 + ',%3d' * 6 + ' },'
for calendar, stem in calendars.items():
- months_data = StringData('months_data')
- calendar_data_file = "q%scalendar_data_p.h" % stem
- calendar_template_file = open(os.path.join(qtsrcdir, 'src', 'corelib', 'time',
- calendar_data_file), "r")
- (calendar_temp_file, calendar_temp_file_path) = tempfile.mkstemp(calendar_data_file, dir=qtsrcdir)
- calendar_temp_file = os.fdopen(calendar_temp_file, "w")
- s = calendar_template_file.readline()
- while s and s != GENERATED_BLOCK_START:
- calendar_temp_file.write(s)
- s = calendar_template_file.readline()
- calendar_temp_file.write(GENERATED_BLOCK_START)
- calendar_temp_file.write(generated_template % (datetime.date.today(), cldr_version))
- calendar_temp_file.write("static const QCalendarLocale locale_data[] = {\n")
- calendar_temp_file.write(' // '
- # IDs, width 7 (6 + comma)
- + ' lang '
- + ' script'
- + ' terr '
- # Month-name start-indices, width 6 (5 + comma):
- + 'sLng '
- + 'long '
- + 'sSrt '
- + 'shrt '
- + 'sNrw '
- + 'naro '
- # No individual headers for the sizes.
- + 'Sizes...'
- + '\n')
- for key in locale_keys:
- l = locale_map[key]
- # Sequence of StringDataToken:
- try:
- # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
- ranges = (tuple(months_data.append(m[calendar], 16) for m in
- (l.standaloneLongMonths, l.longMonths)) +
- tuple(months_data.append(m[calendar]) for m in
- (l.standaloneShortMonths, l.shortMonths,
- l.standaloneNarrowMonths, l.narrowMonths)))
- except ValueError as e:
- e.args += (l.language, l.script, l.country, stem)
- raise
+ try:
+ writer = CalendarDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'time',
+ 'q{}calendar_data_p.h'.format(stem)),
+ qtsrcdir, reader.cldrVersion)
+ except IOError as e:
+ err.write('Failed to open files to transcribe ' + calendar
+ + ' data ' + (e.message or e.args[1]))
+ return 1
+
+ try:
+ writer.write(calendar, locale_map, locale_keys)
+ except Error as e:
+ writer.cleanup()
+ err.write('\nError updating ' + calendar + ' locale data: ' + e.message + '\n')
+ return 1
- calendar_temp_file.write(
- calendar_format
- % ((key[0], key[1], key[2]) +
- tuple(r.index for r in ranges) +
- tuple(r.length for r in ranges))
- + "// %s/%s/%s\n" % (l.language, l.script, l.country))
- calendar_temp_file.write(calendar_format % ( (0,) * (3 + 6 * 2) )
- + '// trailing zeros\n')
- calendar_temp_file.write("};\n")
- months_data.write(calendar_temp_file)
- s = calendar_template_file.readline()
- while s and s != GENERATED_BLOCK_END:
- s = calendar_template_file.readline()
- while s:
- calendar_temp_file.write(s)
- s = calendar_template_file.readline()
- os.rename(calendar_temp_file_path,
- os.path.join(qtsrcdir, 'src', 'corelib', 'time', calendar_data_file))
+ writer.close()
# qlocale.h
+ try:
+ writer = LocaleHeaderWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.h'),
+ qtsrcdir, reader.dupes)
+ except IOError as e:
+ err.write('Failed to open files to transcribe qlocale.h: ' + (e.message or e.args[1]))
+ return 1
- (qlocaleh_temp_file, qlocaleh_temp_file_path) = tempfile.mkstemp("qlocale.h", dir=qtsrcdir)
- qlocaleh_temp_file = os.fdopen(qlocaleh_temp_file, "w")
- qlocaleh_file = open(qtsrcdir + "/src/corelib/text/qlocale.h", "r")
- s = qlocaleh_file.readline()
- while s and s != GENERATED_BLOCK_START:
- qlocaleh_temp_file.write(s)
- s = qlocaleh_file.readline()
- qlocaleh_temp_file.write(GENERATED_BLOCK_START)
- qlocaleh_temp_file.write("// see qlocale_data_p.h for more info on generated data\n")
-
- # Language enum
- qlocaleh_temp_file.write(" enum Language {\n")
- language = None
- for key, value in language_map.items():
- language = fixedLanguageName(value[0], dupes)
- qlocaleh_temp_file.write(" " + language + " = " + str(key) + ",\n")
-
- qlocaleh_temp_file.write("\n " +
- ",\n ".join('%s = %s' % pair
- for pair in sorted(language_aliases.items())) +
- ",\n")
- qlocaleh_temp_file.write("\n")
- qlocaleh_temp_file.write(" LastLanguage = " + language + "\n")
- qlocaleh_temp_file.write(" };\n\n")
-
- # Script enum
- qlocaleh_temp_file.write(" enum Script {\n")
- script = None
- for key, value in script_map.items():
- script = fixedScriptName(value[0], dupes)
- qlocaleh_temp_file.write(" " + script + " = " + str(key) + ",\n")
- qlocaleh_temp_file.write("\n " +
- ",\n ".join('%s = %s' % pair
- for pair in sorted(script_aliases.items())) +
- ",\n")
- qlocaleh_temp_file.write("\n")
- qlocaleh_temp_file.write(" LastScript = " + script + "\n")
- qlocaleh_temp_file.write(" };\n\n")
-
- # Country enum
- qlocaleh_temp_file.write(" enum Country {\n")
- country = None
- for key, value in country_map.items():
- country = fixedCountryName(value[0], dupes)
- qlocaleh_temp_file.write(" " + country + " = " + str(key) + ",\n")
- qlocaleh_temp_file.write("\n " +
- ",\n ".join('%s = %s' % pair
- for pair in sorted(country_aliases.items())) +
- ",\n")
- qlocaleh_temp_file.write("\n")
- qlocaleh_temp_file.write(" LastCountry = " + country + "\n")
- qlocaleh_temp_file.write(" };\n")
-
- qlocaleh_temp_file.write(GENERATED_BLOCK_END)
- s = qlocaleh_file.readline()
- # skip until end of the old block
- while s and s != GENERATED_BLOCK_END:
- s = qlocaleh_file.readline()
-
- s = qlocaleh_file.readline()
- while s:
- qlocaleh_temp_file.write(s)
- s = qlocaleh_file.readline()
- qlocaleh_temp_file.close()
- qlocaleh_file.close()
-
- os.remove(qtsrcdir + "/src/corelib/text/qlocale.h")
- os.rename(qlocaleh_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.h")
+ try:
+ writer.languages(reader.languages)
+ writer.scripts(reader.scripts)
+ writer.countries(reader.countries)
+ except Error as e:
+ writer.cleanup()
+ err.write('\nError updating qlocale.h: ' + e.message + '\n')
+ return 1
+
+ writer.close()
# qlocale.qdoc
+ try:
+ writer = Transcriber(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.qdoc'),
+ qtsrcdir)
+ except IOError as e:
+ err.write('Failed to open files to transcribe qlocale.qdoc: ' + (e.message or e.args[1]))
+ return 1
- (qlocaleqdoc_temp_file, qlocaleqdoc_temp_file_path) = tempfile.mkstemp("qlocale.qdoc", dir=qtsrcdir)
- qlocaleqdoc_temp_file = os.fdopen(qlocaleqdoc_temp_file, "w")
- qlocaleqdoc_file = open(qtsrcdir + "/src/corelib/text/qlocale.qdoc", "r")
- s = qlocaleqdoc_file.readline()
DOCSTRING = " QLocale's data is based on Common Locale Data Repository "
- while s:
- if DOCSTRING in s:
- qlocaleqdoc_temp_file.write(DOCSTRING + "v" + cldr_version + ".\n")
- else:
- qlocaleqdoc_temp_file.write(s)
- s = qlocaleqdoc_file.readline()
- qlocaleqdoc_temp_file.close()
- qlocaleqdoc_file.close()
-
- os.remove(qtsrcdir + "/src/corelib/text/qlocale.qdoc")
- os.rename(qlocaleqdoc_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.qdoc")
+ try:
+ for line in writer.reader:
+ if DOCSTRING in line:
+ writer.writer.write(DOCSTRING + 'v' + reader.cldrVersion + '.\n')
+ else:
+ writer.writer.write(line)
+ except Error as e:
+ writer.cleanup()
+ err.write('\nError updating qlocale.qdoc: ' + e.message + '\n')
+ return 1
+
+ writer.close()
+ return 0
if __name__ == "__main__":
- main()
+ import sys
+ sys.exit(main(sys.argv, sys.stdout, sys.stderr))
diff --git a/util/locale_database/xpathlite.py b/util/locale_database/xpathlite.py
deleted file mode 100644
index 97efaaab41..0000000000
--- a/util/locale_database/xpathlite.py
+++ /dev/null
@@ -1,288 +0,0 @@
-#!/usr/bin/env python
-#############################################################################
-##
-## Copyright (C) 2016 The Qt Company Ltd.
-## Contact: https://www.qt.io/licensing/
-##
-## This file is part of the test suite of the Qt Toolkit.
-##
-## $QT_BEGIN_LICENSE:GPL-EXCEPT$
-## Commercial License Usage
-## Licensees holding valid commercial Qt licenses may use this file in
-## accordance with the commercial license agreement provided with the
-## Software or, alternatively, in accordance with the terms contained in
-## a written agreement between you and The Qt Company. For licensing terms
-## and conditions see https://www.qt.io/terms-conditions. For further
-## information use the contact form at https://www.qt.io/contact-us.
-##
-## GNU General Public License Usage
-## Alternatively, this file may be used under the terms of the GNU
-## General Public License version 3 as published by the Free Software
-## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-## included in the packaging of this file. Please review the following
-## information to ensure the GNU General Public License requirements will
-## be met: https://www.gnu.org/licenses/gpl-3.0.html.
-##
-## $QT_END_LICENSE$
-##
-#############################################################################
-
-import sys
-import os
-import xml.dom.minidom
-
-class DraftResolution:
- # See http://www.unicode.org/cldr/process.html for description
- unconfirmed = 'unconfirmed'
- provisional = 'provisional'
- contributed = 'contributed'
- approved = 'approved'
- _values = { unconfirmed : 1, provisional : 2, contributed : 3, approved : 4 }
- def __init__(self, resolution):
- self.resolution = resolution
- def toInt(self):
- return DraftResolution._values[self.resolution]
-
-class Error:
- def __init__(self, msg):
- self.msg = msg
- def __str__(self):
- return self.msg
-
-doc_cache = {}
-def parseDoc(file):
- if not doc_cache.has_key(file):
- doc_cache[file] = xml.dom.minidom.parse(file)
- return doc_cache[file]
-
-def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
- for node in parent.childNodes:
- if node.nodeType != node.ELEMENT_NODE:
- continue
- if node.nodeName != tag_name:
- continue
- if arg_value:
- if not node.attributes.has_key(arg_name):
- continue
- if node.attributes[arg_name].nodeValue != arg_value:
- continue
- if draft:
- if not node.attributes.has_key('draft'):
- # if draft is not specified then it's approved
- return node
- value = node.attributes['draft'].nodeValue
- value = DraftResolution(value).toInt()
- exemplar = DraftResolution(draft).toInt()
- if exemplar > value:
- continue
- return node
- return False
-
-def codeMapsFromFile(file):
- """Extract mappings of language, script and country codes to names.
-
- The file shall typically be common/main/en.xml, which contains a
- localeDisplayNames element with children languages, scripts and
- territories; each element in each of these has a code as its type
- attribute and its name as element content. This returns a mapping
- withe keys 'language', 'script' and 'country', each of which
- has, as value, a mapping of the relevant codes to names.
- """
- parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
- keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
- for src, dst in keys.items():
- child = findChild(parent, src)
- data = result[dst] = {}
- for elt in child.childNodes:
- if elt.attributes and elt.attributes.has_key('type'):
- key, value = elt.attributes['type'].value, elt.childNodes[0].wholeText
- # Don't over-write previously-read data for an alt form:
- if elt.attributes.has_key('alt') and data.has_key(key):
- continue
- data[key] = value
-
- return result
-
-def findTagsInFile(file, path):
- doc = parseDoc(file)
-
- elt = doc.documentElement
- tag_spec_list = path.split("/")
- last_entry = None
- for tag_spec in tag_spec_list:
- tag_name = tag_spec
- arg_name = 'type'
- arg_value = ''
- left_bracket = tag_spec.find('[')
- if left_bracket != -1:
- tag_name = tag_spec[:left_bracket]
- arg_value = tag_spec[left_bracket+1:-1].split("=")
- if len(arg_value) == 2:
- arg_name = arg_value[0]
- arg_value = arg_value[1]
- else:
- arg_value = arg_value[0]
- elt = findChild(elt, tag_name, arg_name, arg_value)
- if not elt:
- return None
- ret = []
- if elt.childNodes:
- for node in elt.childNodes:
- if node.attributes:
- element = [node.nodeName, None]
- element[1] = node.attributes.items()
- ret.append(element)
- else:
- if elt.attributes:
- element = [elt.nodeName, None]
- element[1] = elt.attributes.items()
- ret.append(element)
- return ret
-
-def _findEntryInFile(file, path, draft=None, attribute=None):
- doc = parseDoc(file)
-
- elt = doc.documentElement
- tag_spec_list = path.split("/")
- last_entry = None
- for i in range(len(tag_spec_list)):
- tag_spec = tag_spec_list[i]
- tag_name = tag_spec
- arg_name = 'type'
- arg_value = ''
- left_bracket = tag_spec.find('[')
- if left_bracket != -1:
- tag_name = tag_spec[:left_bracket]
- arg_value = tag_spec[left_bracket+1:-1].split("=")
- if len(arg_value) == 2:
- arg_name = arg_value[0].replace("@", "").replace("'", "")
- arg_value = arg_value[1]
- else:
- arg_value = arg_value[0]
- alias = findChild(elt, 'alias')
- if alias and alias.attributes['source'].nodeValue == 'locale':
- path = alias.attributes['path'].nodeValue
- aliaspath = tag_spec_list[:i] + path.split("/")
- def resolve(x, y):
- if y == '..':
- return x[:-1]
- return x + [y]
- # resolve all dot-dot parts of the path
- aliaspath = reduce(resolve, aliaspath, [])
- # remove attribute specification that our xpathlite doesnt support
- aliaspath = map(lambda x: x.replace("@type=", "").replace("'", ""), aliaspath)
- # append the remaining path
- aliaspath = aliaspath + tag_spec_list[i:]
- aliaspath = "/".join(aliaspath)
- # "locale" aliases are special - we need to start lookup from scratch
- return (None, aliaspath)
- elt = findChild(elt, tag_name, arg_name, arg_value, draft)
- if not elt:
- return ("", None)
- if attribute is not None:
- if elt.attributes.has_key(attribute):
- return (elt.attributes[attribute].nodeValue, None)
- return (None, None)
- try:
- return (elt.firstChild.nodeValue, None)
- except:
- pass
- return (None, None)
-
-def findAlias(file):
- doc = parseDoc(file)
-
- alias_elt = findChild(doc.documentElement, "alias")
- if not alias_elt:
- return False
- if not alias_elt.attributes.has_key('source'):
- return False
- return alias_elt.attributes['source'].nodeValue
-
-lookup_chain_cache = {}
-parent_locales = {}
-def _fixedLookupChain(dirname, name):
- if lookup_chain_cache.has_key(name):
- return lookup_chain_cache[name]
-
- # see http://www.unicode.org/reports/tr35/#Parent_Locales
- if not parent_locales:
- for ns in findTagsInFile(dirname + "/../supplemental/supplementalData.xml", "parentLocales"):
- tmp = {}
- parent_locale = ""
- for data in ns[1:][0]: # ns looks like this: [u'parentLocale', [(u'parent', u'root'), (u'locales', u'az_Cyrl bs_Cyrl en_Dsrt ..')]]
- tmp[data[0]] = data[1]
- if data[0] == u"parent":
- parent_locale = data[1]
- parent_locales[parent_locale] = tmp[u"locales"].split(" ")
-
- items = name.split("_")
- # split locale name into items and iterate through them from back to front
- # example: az_Latn_AZ => [az_Latn_AZ, az_Latn, az]
- items = list(reversed(map(lambda x: "_".join(items[:x+1]), range(len(items)))))
-
- for i in range(len(items)):
- item = items[i]
- for parent_locale in parent_locales.keys():
- for locale in parent_locales[parent_locale]:
- if item == locale:
- if parent_locale == u"root":
- items = items[:i+1]
- else:
- items = items[:i+1] + _fixedLookupChain(dirname, parent_locale)
- lookup_chain_cache[name] = items
- return items
-
- lookup_chain_cache[name] = items
- return items
-
-def _findEntry(base, path, draft=None, attribute=None):
- if base.endswith(".xml"):
- base = base[:-4]
- (dirname, filename) = os.path.split(base)
-
- items = _fixedLookupChain(dirname, filename)
- for item in items:
- file = dirname + "/" + item + ".xml"
- if os.path.isfile(file):
- alias = findAlias(file)
- if alias:
- # if alias is found we should follow it and stop processing current file
- # see http://www.unicode.org/reports/tr35/#Common_Elements
- aliasfile = os.path.dirname(file) + "/" + alias + ".xml"
- if not os.path.isfile(aliasfile):
- raise Error("findEntry: fatal error: found an alias '%s' to '%s', but the alias file couldn't be found" % (filename, alias))
- # found an alias, recurse into parsing it
- result = _findEntry(aliasfile, path, draft, attribute)
- return result
- (result, aliaspath) = _findEntryInFile(file, path, draft, attribute)
- if aliaspath:
- # start lookup again because of the alias source="locale"
- return _findEntry(base, aliaspath, draft, attribute)
- if result:
- return result
- return None
-
-def findEntry(base, path, draft=None, attribute=None):
- file = base
- if base.endswith(".xml"):
- file = base
- base = base[:-4]
- else:
- file = base + ".xml"
- (dirname, filename) = os.path.split(base)
-
- result = None
- while path:
- result = _findEntry(base, path, draft, attribute)
- if result:
- return result
- (result, aliaspath) = _findEntryInFile(dirname + "/root.xml", path, draft, attribute)
- if result:
- return result
- if not aliaspath:
- raise Error("findEntry: fatal error: %s: cannot find key %s" % (filename, path))
- path = aliaspath
-
- return result
-