diff options
author | Qt Forward Merge Bot <qt_forward_merge_bot@qt-project.org> | 2020-04-07 01:00:12 +0200 |
---|---|---|
committer | Fabian Kosmale <fabian.kosmale@qt.io> | 2020-04-08 22:04:23 +0200 |
commit | c937ed8af4f3dfef3fd8f8c2a9815376790dd5bf (patch) | |
tree | 5175aff87e160ae8f32dadc60d3cfd38b73d4fb1 /util | |
parent | e0346df1b21cb30b54ae8d4918addc9925fa8479 (diff) | |
parent | 8823bb8d306d78dd6a2e121a708dc607beff58c8 (diff) |
Merge "Merge remote-tracking branch 'origin/5.15' into dev"
Diffstat (limited to 'util')
-rw-r--r-- | util/locale_database/cldr.py | 718 | ||||
-rwxr-xr-x | util/locale_database/cldr2qlocalexml.py | 705 | ||||
-rwxr-xr-x | util/locale_database/cldr2qtimezone.py | 369 | ||||
-rw-r--r-- | util/locale_database/ldml.py | 589 | ||||
-rw-r--r-- | util/locale_database/localetools.py | 164 | ||||
-rw-r--r-- | util/locale_database/qlocalexml.py | 368 | ||||
-rwxr-xr-x | util/locale_database/qlocalexml2cpp.py | 1252 | ||||
-rw-r--r-- | util/locale_database/xpathlite.py | 288 |
8 files changed, 2511 insertions, 1942 deletions
diff --git a/util/locale_database/cldr.py b/util/locale_database/cldr.py new file mode 100644 index 0000000000..4b54f50080 --- /dev/null +++ b/util/locale_database/cldr.py @@ -0,0 +1,718 @@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Digesting the CLDR's data. + +Provides two classes: + CldrReader -- driver for reading CLDR data + CldrAccess -- used by the reader to access the tree of data files + +The former should normally be all you need to access. +See individual classes for further detail. 
+""" + +from xml.dom import minidom +from weakref import WeakValueDictionary as CacheDict +import os + +from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner +from qlocalexml import Locale + +class CldrReader (object): + def __init__(self, root, grumble = lambda msg: None, whitter = lambda msg: None): + """Set up a reader object for reading CLDR data. + + Single parameter, root, is the file-system path to the root of + the unpacked CLDR archive; its common/ sub-directory should + contain dtd/, main/ and supplemental/ sub-directories. + + Optional second argument, grumble, is a callable that logs + warnings and complaints, e.g. sys.stderr.write would be a + suitable callable. The default is a no-op that ignores its + single argument. Optional third argument is similar, used for + less interesting output; pass sys.stderr.write for it for + verbose output.""" + self.root = CldrAccess(root) + self.whitter, self.grumble = whitter, grumble + + def likelySubTags(self): + """Generator for likely subtag information. + + Yields pairs (have, give) of 4-tuples; if what you have + matches the left member, giving the right member is probably + sensible. 
Each 4-tuple's entries are the full names of a + language, a script, a country (strictly territory) and a + variant (currently ignored).""" + skips = [] + for got, use in self.root.likelySubTags(): + try: + have = self.__parseTags(got) + give = self.__parseTags(use) + except Error as e: + if ((use.startswith(got) or got.startswith('und_')) + and e.message.startswith('Unknown ') and ' code ' in e.message): + skips.append(use) + else: + self.grumble('Skipping likelySubtag "{}" -> "{}" ({})\n'.format(got, use, e.message)) + continue + if all(code.startswith('Any') and code[3].isupper() for code in have[:-1]): + continue + + give = (give[0], + # Substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags + have[1] if give[1] == 'AnyScript' else give[1], + have[2] if give[2] == 'AnyCountry' else give[2], + give[3]) # AnyVariant similarly ? + + yield have, give + + if skips: + # TODO: look at LDML's reserved locale tag names; they + # show up a lot in this, and may be grounds for filtering + # more out. 
+ pass # self.__wrapped(self.whitter, 'Skipping likelySubtags (for unknown codes): ', skips) + + def readLocales(self, calendars = ('gregorian',)): + locales = tuple(self.__allLocales(calendars)) + return dict(((k.language_id, k.script_id, k.country_id, k.variant_code), + k) for k in locales) + + def __allLocales(self, calendars): + def skip(locale, reason): + return 'Skipping defaultContent locale "{}" ({})\n'.format(locale, reason) + + for locale in self.root.defaultContentLocales: + try: + language, script, country, variant = self.__splitLocale(locale) + except ValueError: + self.whitter(skip(locale, 'only language tag')) + continue + + if not (script or country): + self.grumble(skip(locale, 'second tag is neither script nor territory')) + continue + + if not (language and country): + continue + + try: + yield self.__getLocaleData(self.root.locale(locale), calendars, + language, script, country, variant) + except Error as e: + self.grumble(skip(locale, e.message)) + + for locale in self.root.fileLocales: + try: + chain = self.root.locale(locale) + language, script, country, variant = chain.tagCodes() + assert language + # TODO: this skip should probably be based on likely + # sub-tags, instead of empty country: if locale has a + # likely-subtag expansion, that's what QLocale uses, + # and we'll be saving its data for the expanded locale + # anyway, so don't need to record it for itself. + # See also QLocaleXmlReader.loadLocaleMap's grumble. 
+ if not country: + continue + yield self.__getLocaleData(chain, calendars, language, script, country, variant) + except Error as e: + self.grumble('Skipping file locale "{}" ({})\n'.format(locale, e.message)) + + import textwrap + @staticmethod + def __wrapped(writer, prefix, tokens, wrap = textwrap.wrap): + writer('\n'.join(wrap(prefix + ', '.join(tokens), + subsequent_indent=' ', width=80)) + '\n') + del textwrap + + def __parseTags(self, locale): + tags = self.__splitLocale(locale) + language = tags.next() + script = country = variant = '' + try: + script, country, variant = tags + except ValueError: + pass + return tuple(p[1] for p in self.root.codesToIdName(language, script, country, variant)) + + def __splitLocale(self, name): + """Generate (language, script, territory, variant) from a locale name + + Ignores any trailing fields (with a warning), leaves script (a + capitalised four-letter token), territory (either a number or + an all-uppercase token) or variant (upper case and digits) + empty if unspecified. Only generates one entry if name is a + single tag (i.e. contains no underscores). Always yields 1 or + 4 values, never 2 or 3.""" + tags = iter(name.split('_')) + yield tags.next() # Language + tag = tags.next() # may raise StopIteration + + # Script is always four letters, always capitalised: + if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Territory is upper-case or numeric: + if tag and tag.isupper() or tag.isdigit(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Variant can be any mixture of upper-case and digits. 
+ if tag and all(c.isupper() or c.isdigit() for c in tag): + yield tag + tag = '' + else: + yield '' + + # If nothing is left, StopIteration will avoid the warning: + if not tag: + tag = tags.next() + self.grumble('Ignoring unparsed cruft {} in {}\n'.format('_'.join(tag + tuple(tags)), name)) + + def __getLocaleData(self, scan, calendars, language, script, country, variant): + ids, names = zip(*self.root.codesToIdName(language, script, country, variant)) + assert ids[0] > 0 and ids[2] > 0, (language, script, country, variant) + locale = Locale( + language = names[0], language_code = language, language_id = ids[0], + script = names[1], script_code = script, script_id = ids[1], + country = names[2], country_code = country, country_id = ids[2], + variant_code = variant) + + firstDay, weStart, weEnd = self.root.weekData(country) + assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') + for day in (firstDay, weStart, weEnd)) + + locale.update(firstDayOfWeek = firstDay, + weekendStart = weStart, + weekendEnd = weEnd) + + iso, digits, rounding = self.root.currencyData(country) + locale.update(currencyIsoCode = iso, + currencyDigits = int(digits), + currencyRounding = int(rounding)) + + locale.update(scan.currencyData(iso)) + locale.update(scan.numericData(self.root.numberSystem, self.whitter)) + locale.update(scan.textPatternData()) + locale.update(scan.endonyms(language, script, country, variant)) + locale.update(scan.unitData()) # byte, kB, MB, GB, ..., KiB, MiB, GiB, ... + locale.update(scan.calendarNames(calendars)) # Names of days and months + + return locale + +# Note: various caches assume this class is a singleton, so the +# "default" value for a parameter no caller should pass can serve as +# the cache. If a process were to instantiate this class with distinct +# roots, each cache would be filled by the first to need it ! +class CldrAccess (object): + def __init__(self, root): + """Set up a master object for accessing CLDR data. 
+ + Single parameter, root, is the file-system path to the root of + the unpacked CLDR archive; its common/ sub-directory should + contain dtd/, main/ and supplemental/ sub-directories.""" + self.root = root + + def xml(self, *path): + """Load a single XML file and return its root element as an XmlScanner. + + The path is interpreted relative to self.root""" + return XmlScanner(Node(self.__xml(path))) + + def supplement(self, name): + """Loads supplemental data as a Supplement object. + + The name should be that of a file in common/supplemental/, without path. + """ + return Supplement(Node(self.__xml(('common', 'supplemental', name)))) + + def locale(self, name): + """Loads all data for a locale as a LocaleScanner object. + + The name should be a locale name; adding suffix '.xml' to it + should usually yield a file in common/main/. The returned + LocaleScanner object packages this file along with all those + from which it inherits; its methods know how to handle that + inheritance, where relevant.""" + return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) + + @property + def fileLocales(self, joinPath = os.path.join, listDirectory = os.listdir, + splitExtension = os.path.splitext): + """Generator for locale IDs seen in file-names. 
+ + All *.xml other than root.xml in common/main/ are assumed to + identify locales.""" + for name in listDirectory(joinPath(self.root, 'common', 'main')): + stem, ext = splitExtension(name) + if ext == '.xml' and stem != 'root': + yield stem + + @property + def defaultContentLocales(self): + """Generator for the default content locales.""" + for name, attrs in self.supplement('supplementalMetadata.xml').find('metadata/defaultContent'): + try: + locales = attrs['locales'] + except KeyError: + pass + else: + for locale in locales.split(): + yield locale + + def likelySubTags(self): + for ignore, attrs in self.supplement('likelySubtags.xml').find('likelySubtags'): + yield attrs['from'], attrs['to'] + + def numberSystem(self, system): + """Get a description of a numbering system. + + Returns a mapping, with keys u'digits', u'type' and u'id'; the + value for this last is system. Raises KeyError for unknown + number system, ldml.Error on failure to load data.""" + try: + return self.__numberSystems[system] + except KeyError: + raise Error('Unsupported number system: {}'.format(system)) + + def weekData(self, country): + """Data on the weekly cycle. + + Returns a triple (W, S, E) of en's short names for week-days; + W is the first day of the week, S the start of the week-end + and E the end of the week-end. Where data for a country is + unavailable, the data for CLDR's territory 001 (The World) is + used.""" + try: + return self.__weekData[country] + except KeyError: + return self.__weekData['001'] + + def currencyData(self, country): + """Returns currency data for the given country code. + + Return value is a tuple (ISO4217 code, digit count, rounding + mode). If CLDR provides no data for this country, ('', 2, 1) + is the default result. + """ + try: + return self.__currencyData[country] + except KeyError: + return '', 2, 1 + + def codesToIdName(self, language, script, country, variant = ''): + """Maps each code to the appropriate ID and name. 
+ + Returns a 4-tuple of (ID, name) pairs corresponding to the + language, script, country and variant given. Raises a + suitable error if any of them is unknown, indicating all that + are unknown plus suitable names for any that could sensibly be + added to enumdata.py to make them known. + + Until we implement variant support (QTBUG-81051), the fourth + member of the returned tuple is always 0 paired with a string + that should not be used.""" + enum = self.__enumMap + try: + return (enum('language')[language], + enum('script')[script], + enum('country')[country], + enum('variant')[variant]) + except KeyError: + pass + + parts, values = [], [language, script, country, variant] + for index, key in enumerate(('language', 'script', 'country', 'variant')): + naming, enums = self.__codeMap(key), enum(key) + value = values[index] + if value not in enums: + text = '{} code {}'.format(key, value) + name = naming.get(value) + if name and value != 'POSIX': + text += u' (could add {})'.format(name) + parts.append(text) + if len(parts) > 1: + parts[-1] = 'and ' + parts[-1] + assert parts + raise Error('Unknown ' + ', '.join(parts), + language, script, country, variant) + + def readWindowsTimeZones(self, lookup): # For use by cldr2qtimezone.py + """Digest CLDR's MS-Win time-zone name mapping. + + MS-Win have their own eccentric names for time-zones. CLDR + helpfully provides a translation to more orthodox names. + + Singe argument, lookup, is a mapping from known MS-Win names + for locales to a unique integer index (starting at 1). + + The XML structure we read has the form: + + <supplementalData> + <windowsZones> + <mapTimezones otherVersion="..." 
typeVersion="..."> + <!-- (UTC-08:00) Pacific Time (US & Canada) --> + <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/> + <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/> + <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/> + <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/> + </mapTimezones> + </windowsZones> + </supplementalData> +""" + zones = self.supplement('windowsZones.xml') + enum = self.__enumMap('country') + badZones, unLands, defaults, windows = set(), set(), {}, {} + + for name, attrs in zones.find('windowsZones/mapTimezones'): + if name != 'mapZone': + continue + + wid, code = attrs['other'], attrs['territory'] + data = dict(windowsId = wid, + countryCode = code, + ianaList = attrs['type']) + + try: + key = lookup[wid] + except KeyError: + badZones.add(wid) + key = 0 + data['windowsKey'] = key + + if code == u'001': + defaults[key] = data['ianaList'] + else: + try: + cid, name = enum[code] + except KeyError: + unLands.append(code) + continue + data.update(countryId = cid, country = name) + windows[key, cid] = data + + if unLands: + raise Error('Unknown country codes, please add to enumdata.py: ' + + ', '.join(sorted(unLands))) + + if badZones: + raise Error('Unknown Windows IDs, please add to cldr2qtimezone.py: ' + + ', '.join(sorted(badZones))) + + return self.cldrVersion, defaults, windows + + @property + def cldrVersion(self): + # Evaluate so as to ensure __cldrVersion is set: + self.__unDistinguishedAttributes + return self.__cldrVersion + + # Implementation details + def __xml(self, path, cache = CacheDict(), read = minidom.parse, joinPath = os.path.join): + try: + doc = cache[path] + except KeyError: + cache[path] = doc = read(joinPath(self.root, *path)).documentElement + return doc + + def __open(self, path, joinPath=os.path.join): + return open(joinPath(self.root, *path)) + 
+ @property + def __rootLocale(self, cache = []): + if not cache: + cache.append(self.xml('common', 'main', 'root.xml')) + return cache[0] + + @property + def __supplementalData(self, cache = []): + if not cache: + cache.append(self.supplement('supplementalData.xml')) + return cache[0] + + @property + def __numberSystems(self, cache = {}, joinPath=os.path.join): + if not cache: + for ignore, attrs in self.supplement('numberingSystems.xml').find('numberingSystems'): + cache[attrs['id']] = attrs + assert cache + return cache + + @property + def __weekData(self, cache = {}): + if not cache: + firstDay, weStart, weEnd = self.__getWeekData() + # Massage those into an easily-consulted form: + # World defaults given for code '001': + mon, sat, sun = firstDay['001'], weStart['001'], weEnd['001'] + lands = set(firstDay) | set(weStart) | set(weEnd) + cache.update((land, + (firstDay.get(land, mon), weStart.get(land, sat), weEnd.get(land, sun))) + for land in lands) + assert cache + return cache + + def __getWeekData(self): + """Scan for data on the weekly cycle. + + Yields three mappings from locales to en's short names for + week-days; if a locale isn't a key of a given mapping, it + should use the '001' (world) locale's value. 
The first mapping + gives the day on which the week starts, the second gives the + day on which the week-end starts, the third gives the last day + of the week-end.""" + source = self.__supplementalData + for key in ('firstDay', 'weekendStart', 'weekendEnd'): + result = {} + for ignore, attrs in source.find('weekData/' + key): + assert ignore == key + day = attrs['day'] + assert day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'), day + if 'alt' in attrs: + continue + for loc in attrs.get('territories', '').split(): + result[loc] = day + yield result + + @property + def __currencyData(self, cache = {}): + if not cache: + source = self.__supplementalData + for elt in source.findNodes('currencyData/region'): + iso, digits, rounding = '', 2, 1 + try: + country = elt.dom.attributes['iso3166'].nodeValue + except KeyError: + continue + for child in elt.findAllChildren('currency'): + try: + if child.dom.attributes['tender'].nodeValue == 'false': + continue + except KeyError: + pass + try: + child.dom.attributes['to'] # Is set if this element has gone out of date. + except KeyError: + iso = child.dom.attributes['iso4217'].nodeValue + break + if iso: + for tag, data in source.find( + 'currencyData/fractions/info[iso4217={}]'.format(iso)): + digits = data['digits'] + rounding = data['rounding'] + cache[country] = iso, digits, rounding + assert cache + + return cache + + @property + def __unDistinguishedAttributes(self, cache = {}, joinPath = os.path.join): + """Mapping from tag names to lists of attributes. + + LDML defines some attributes as 'distinguishing': if a node + has distinguishing attributes that weren't specified in an + XPath, a search on that XPath should exclude the node's + children. + + This property is a mapping from tag names to tuples of + attribute names that *aren't* distinguishing for that tag. 
+ Its value is cached (so its costly computation isonly done + once) and there's a side-effect of populating its cache: it + sets self.__cldrVersion to the value found in ldml.dtd, during + parsing.""" + if not cache: + cache.update(self.__scanLdmlDtd()) + assert cache + + return cache + + def __scanLdmlDtd(self, joinPath = os.path.join): + """Scan the LDML DTD, record CLDR version + + Yields (tag, attrs) pairs: on elements with a given tag, + attributes named in its attrs (a tuple) may be ignored in an + XPath search; other attributes are distinguished attributes, + in the terminology of LDML's locale-inheritance rules. + + Sets self.__cldrVersion as a side-effect, since this + information is found in the same file.""" + with self.__open(('common', 'dtd', 'ldml.dtd')) as dtd: + tag, ignored, last = None, None, None + + for line in dtd: + if line.startswith('<!ELEMENT '): + if ignored: + assert tag + yield tag, tuple(ignored) + tag, ignored, last = line.split()[1], [], None + continue + + if line.startswith('<!ATTLIST '): + assert tag is not None + parts = line.split() + assert parts[1] == tag + last = parts[2] + if parts[1:5] == ['version', 'cldrVersion', 'CDATA', '#FIXED']: + # parts[5] is the version, in quotes, although the final > might be stuck on its end: + self.__cldrVersion = parts[5].split('"')[1] + continue + + # <!ELEMENT...>s can also be @METADATA, but not @VALUE: + if '<!--@VALUE-->' in line or (last and '<!--@METADATA-->' in line): + assert last is not None + assert ignored is not None + assert tag is not None + ignored.append(last) + last = None # No attribute is both value and metadata + + if tag and ignored: + yield tag, tuple(ignored) + + def __enumMap(self, key, cache = {}): + if not cache: + cache['variant'] = {'': (0, 'This should never be seen outside ldml.py')} + # They're not actually lists: mappings from numeric value + # to pairs of full name and short code. What we want, in + # each case, is a mapping from code to the other two. 
+ from enumdata import language_list, script_list, country_list + for form, book, empty in (('language', language_list, 'AnyLanguage'), + ('script', script_list, 'AnyScript'), + ('country', country_list, 'AnyCountry')): + cache[form] = dict((pair[1], (num, pair[0])) + for num, pair in book.items() if pair[0] != 'C') + # (Have to filter out the C locale, as we give it the + # same (all space) code as AnyLanguage, whose code + # should probably be 'und' instead.) + + # Map empty to zero and the any value: + cache[form][''] = (0, empty) + # and map language code 'und' also to (0, any): + cache['language']['und'] = (0, 'AnyLanguage') + + return cache[key] + + def __codeMap(self, key, cache = {}, + # Maps our name for it to CLDR's name: + naming = {'language': 'languages', 'script': 'scripts', + 'country': 'territories', 'variant': 'variants'}): + if not cache: + root = self.xml('common', 'main', 'en.xml').root.findUniqueChild('localeDisplayNames') + for dst, src in naming.items(): + cache[dst] = dict(self.__codeMapScan(root.findUniqueChild(src))) + assert cache + + return cache[key] + + def __codeMapScan(self, node): + """Get mapping from codes to element values. + + Passed in node is a <languages>, <scripts>, <territories> or + <variants> node, each child of which is a <language>, + <script>, <territory> or <variant> node as appropriate, whose + type is a code (of the appropriate flavour) and content is its + full name. In some cases, two child nodes have the same type; + in these cases, one always has an alt attribute and we should + prefer the other. 
Yields all such type, content pairs found + in node's children (skipping any with an alt attribute, if + their type has been seen previously).""" + seen = set() + for elt in node.dom.childNodes: + try: + key, value = elt.attributes['type'].nodeValue, elt.childNodes[0].wholeText + except (KeyError, ValueError, TypeError): + pass + else: + if key not in seen or not elt.attributes.has_key('alt'): + yield key, value + seen.add(key) + + # CLDR uses inheritance between locales to save repetition: + def __parentLocale(self, name, cache = {}): + # see http://www.unicode.org/reports/tr35/#Parent_Locales + if not cache: + for tag, attrs in self.__supplementalData.find('parentLocales'): + parent = attrs.get('parent', '') + for child in attrs['locales'].split(): + cache[child] = parent + assert cache + + return cache[name] + + def __localeAsDoc(self, name, aliasFor = None, + joinPath = os.path.join, exists = os.path.isfile): + path = ('common', 'main', name + '.xml') + if exists(joinPath(self.root, *path)): + elt = self.__xml(path) + for child in Node(elt).findAllChildren('alias'): + try: + alias = child.dom.attributes['source'].nodeValue + except (KeyError, AttributeError): + pass + else: + return self.__localeAsDoc(alias, aliasFor or name) + # No alias child with a source: + return elt + + if aliasFor: + raise Error('Fatal error: found an alias "{}" -> "{}", but found no file for the alias' + .format(aliasFor, name)) + + def __scanLocaleRoots(self, name): + while name and name != 'root': + doc = self.__localeAsDoc(name) + if doc is not None: + yield Node(doc, self.__unDistinguishedAttributes) + + try: + name = self.__parentLocale(name) + except KeyError: + try: + name, tail = name.rsplit('_', 1) + except ValueError: # No tail to discard: we're done + break + + class __Seq (list): pass # No weakref for tuple and list, but list sub-class is ok. 
+ def __localeRoots(self, name, cache = CacheDict()): + try: + chain = cache[name] + except KeyError: + cache[name] = chain = self.__Seq(self.__scanLocaleRoots(name)) + return chain + +# Unpolute the namespace: we don't need to export these. +del minidom, CacheDict, os diff --git a/util/locale_database/cldr2qlocalexml.py b/util/locale_database/cldr2qlocalexml.py index 7f98e29d47..c05cabf520 100755 --- a/util/locale_database/cldr2qlocalexml.py +++ b/util/locale_database/cldr2qlocalexml.py @@ -31,15 +31,17 @@ The CLDR data can be downloaded from CLDR_, which has a sub-directory for each version; you need the ``core.zip`` file for your version of -choice (typically the latest). This script has had updates to cope up -to v35; for later versions, we may need adaptations. Unpack the +choice (typically the latest). This script has had updates to cope up +to v35; for later versions, we may need adaptations. Unpack the downloaded ``core.zip`` and check it has a common/main/ sub-directory: -pass the path of that sub-directory to this script as its single -command-line argument. Save its standard output (but not error) to a -file for later processing by ``./qlocalexml2cpp.py`` +pass the path of that root of the download to this script as its first +command-line argument. Pass the name of the file in which to write +output as the second argument; either omit it or use '-' to select the +standard output. This file is the input needed by +``./qlocalexml2cpp.py`` When you update the CLDR data, be sure to also update -src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check +src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check this script's output for unknown language, country or script messages; if any can be resolved, use their entry in common/main/en.xml to append new entries to enumdata.py's lists and update documentation in @@ -54,646 +56,67 @@ time zone names; see cldr2qtimezone.py for details. 
import os import sys -import re -import textwrap -import enumdata -import xpathlite -from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile -from dateconverter import convert_date -from qlocalexml import Locale - -# TODO: make calendars a command-line option -calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew' -findEntryInFile = xpathlite._findEntryInFile -def wrappedwarn(prefix, tokens): - return sys.stderr.write( - '\n'.join(textwrap.wrap(prefix + ', '.join(tokens), - subsequent_indent=' ', width=80)) + '\n') - -def parse_number_format(patterns, data): - # this is a very limited parsing of the number format for currency only. - def skip_repeating_pattern(x): - p = x.replace('0', '#').replace(',', '').replace('.', '') - seen = False - result = '' - for c in p: - if c == '#': - if seen: - continue - seen = True - else: - seen = False - result = result + c - return result - patterns = patterns.split(';') - result = [] - for pattern in patterns: - pattern = skip_repeating_pattern(pattern) - pattern = pattern.replace('#', "%1") - # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns - # there can be doubled or trippled currency sign, however none of the - # locales use that. - pattern = pattern.replace(u'\xa4', "%2") - pattern = pattern.replace("''", "###").replace("'", '').replace("###", "'") - pattern = pattern.replace('-', data['minus']) - pattern = pattern.replace('+', data['plus']) - result.append(pattern) - return result - -def raiseUnknownCode(code, form, cache={}): - """Check whether an unknown code could be supported. - - We declare a language, script or country code unknown if it's not - known to enumdata.py; however, if it's present in main/en.xml's - mapping of codes to names, we have the option of adding support. - This caches the necessary look-up (so we only read main/en.xml - once) and returns the name we should use if we do add support. - - First parameter, code, is the unknown code. 
Second parameter, - form, is one of 'language', 'script' or 'country' to select the - type of code to look up. Do not pass further parameters (the next - will deprive you of the cache). - - Raises xpathlite.Error with a suitable message, that includes the - unknown code's full name if found. - - Relies on global cldr_dir being set before it's called; see tail - of this file. - """ - if not cache: - cache.update(xpathlite.codeMapsFromFile(os.path.join(cldr_dir, 'en.xml'))) - name = cache[form].get(code) - msg = 'unknown %s code "%s"' % (form, code) - if name: - msg += ' - could use "%s"' % name - raise xpathlite.Error(msg) - -def parse_list_pattern_part_format(pattern): - # This is a very limited parsing of the format for list pattern part only. - return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3") - -def unit_quantifiers(find, path, stem, suffix, known, - # Stop at exa/exbi: 16 exbi = 2^{64} < zetta = - # 1000^7 < zebi = 2^{70}, the next quantifiers up: - si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')): - """Work out the unit quantifiers. - - Unfortunately, the CLDR data only go up to terabytes and we want - all the way to exabytes; but we can recognize the SI quantifiers - as prefixes, strip and identify the tail as the localized - translation for 'B' (e.g. French has 'octet' for 'byte' and uses - ko, Mo, Go, To from which we can extrapolate Po, Eo). - - Should be called first for the SI quantifiers, with suffix = 'B', - then for the IEC ones, with suffix = 'iB'; the list known - (initially empty before first call) is used to let the second call - know what the first learned about the localized unit. 
- """ - if suffix == 'B': # first call, known = [] - tail = suffix - for q in si_quantifiers: - it = find(path, stem % q) - # kB for kilobyte, in contrast with KiB for IEC: - q = q[0] if q == 'kilo' else q[0].upper() - if not it: - it = q + tail - elif it.startswith(q): - rest = it[1:] - tail = rest if all(rest == k for k in known) else suffix - known.append(rest) - yield it - else: # second call, re-using first's known - assert suffix == 'iB' - if known: - byte = known.pop() - if all(byte == k for k in known): - suffix = 'i' + byte - for q in si_quantifiers: - yield find(path, stem % q[:2], - # Those don't (yet, v31) exist in CLDR, so we always fall back to: - q[0].upper() + suffix) - -def generateLocaleInfo(path): - if not path.endswith(".xml"): - return {} - - # skip legacy/compatibility ones - alias = findAlias(path) - if alias: - raise xpathlite.Error('alias to "%s"' % alias) - - def code(tag): - return findEntryInFile(path, 'identity/' + tag, attribute="type")[0] - - return _generateLocaleInfo(path, code('language'), code('script'), - code('territory'), code('variant')) - -def getNumberSystems(cache={}): - """Cached look-up of number system information. - - Pass no arguments. Returns a mapping from number system names to, - for each system, a mapping with keys u'digits', u'type' and - u'id'\n""" - if not cache: - for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', - 'numberingSystems.xml'), - 'numberingSystems'): - # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]] - entry = dict(ns[1]) - cache[entry[u'id']] = entry - return cache - -def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""): - if not path.endswith(".xml"): - return {} - - if language_code == 'root': - # just skip it - return {} - - # we do not support variants - # ### actually there is only one locale with variant: en_US_POSIX - # does anybody care about it at all? 
- if variant_code: - raise xpathlite.Error('we do not support variants ("%s")' % variant_code) - - language_id = enumdata.languageCodeToId(language_code) - if language_id <= 0: - raiseUnknownCode(language_code, 'language') - - script_id = enumdata.scriptCodeToId(script_code) - if script_id == -1: - raiseUnknownCode(script_code, 'script') - - # we should handle fully qualified names with the territory - if not country_code: - return {} - country_id = enumdata.countryCodeToId(country_code) - if country_id <= 0: - raiseUnknownCode(country_code, 'country') - - # So we say we accept only those values that have "contributed" or - # "approved" resolution. see http://www.unicode.org/cldr/process.html - # But we only respect the resolution for new datas for backward - # compatibility. - draft = DraftResolution.contributed - - result = dict( - language=enumdata.language_list[language_id][0], - language_code=language_code, language_id=language_id, - script=enumdata.script_list[script_id][0], - script_code=script_code, script_id=script_id, - country=enumdata.country_list[country_id][0], - country_code=country_code, country_id=country_id, - variant_code=variant_code) - - (dir_name, file_name) = os.path.split(path) - def from_supplement(tag, - path=os.path.join(dir_name, '..', 'supplemental', - 'supplementalData.xml')): - return findTagsInFile(path, tag) - currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code) - result['currencyIsoCode'] = '' - result['currencyDigits'] = 2 - result['currencyRounding'] = 1 - if currencies: - for e in currencies: - if e[0] == 'currency': - t = [x[1] == 'false' for x in e[1] if x[0] == 'tender'] - if t and t[0]: - pass - elif not any(x[0] == 'to' for x in e[1]): - result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next() - break - if result['currencyIsoCode']: - t = from_supplement("currencyData/fractions/info[iso4217=%s]" - % result['currencyIsoCode']) - if t and t[0][0] == 'info': - 
result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next() - result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next() - numbering_system = None - try: - numbering_system = findEntry(path, "numbers/defaultNumberingSystem") - except xpathlite.Error: - pass - def findEntryDef(path, xpath, value=''): - try: - return findEntry(path, xpath) - except xpathlite.Error: - return value - def get_number_in_system(path, xpath, numbering_system): - if numbering_system: - try: - return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]") - except xpathlite.Error: - # in CLDR 1.9 number system was refactored for numbers (but not for currency) - # so if previous findEntry doesn't work we should try this: - try: - return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/")) - except xpathlite.Error: - # fallback to default - pass - return findEntry(path, xpath) - - result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system) - result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system) - assert result['decimal'] != result['group'] - result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system) - result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system) - try: - digits = getNumberSystems()[numbering_system][u"digits"]; - assert len(digits) == 10 and all(ord(d) - i == ord(digits[0]) for i, d in enumerate(digits)) - result['zero'] = digits[0] - except Exception as e: - sys.stderr.write("Native zero detection problem: %s\n" % repr(e)) - result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system) - result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system) - result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system) - result['exp'] = get_number_in_system(path, 
"numbers/symbols/exponential", numbering_system) - result['quotationStart'] = findEntry(path, "delimiters/quotationStart") - result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd") - result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart") - result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd") - result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]")) - result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]")) - result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]")) - result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]")) - result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft) - result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft) - result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern")) - result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern")) - result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern")) - result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern")) - - endonym = None - if country_code and script_code: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code)) 
- if not endonym and script_code: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code)) - if not endonym and country_code: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code)) - if not endonym: - endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code)) - result['languageEndonym'] = endonym - result['countryEndonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code)) - - currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system) - currency_format = parse_number_format(currency_format, result) - result['currencyFormat'] = currency_format[0] - result['currencyNegativeFormat'] = '' - if len(currency_format) > 1: - result['currencyNegativeFormat'] = currency_format[1] - - result['currencySymbol'] = '' - result['currencyDisplayName'] = '' - if result['currencyIsoCode']: - stem = "numbers/currencies/currency[%s]/" % result['currencyIsoCode'] - result['currencySymbol'] = findEntryDef(path, stem + 'symbol') - displays = tuple(findEntryDef(path, stem + 'displayName' + tail) - for tail in ('',) + tuple( - '[count=%s]' % x for x in ('zero', 'one', 'two', - 'few', 'many', 'other'))) - while displays and not displays[-1]: - displays = displays[:-1] - result['currencyDisplayName'] = ';'.join(displays) - - def findUnitDef(path, stem, fallback=''): - # The displayName for a quantified unit in en.xml is kByte - # instead of kB (etc.), so prefer any unitPattern provided: - for count in ('many', 'few', 'two', 'other', 'zero', 'one'): - try: - ans = findEntry(path, stem + 'unitPattern[count=%s]' % count) - except xpathlite.Error: - continue - - # TODO: epxloit count-handling, instead of discarding placeholders - if ans.startswith('{0}'): - ans = ans[3:].lstrip() - if ans: - return ans - - return 
findEntryDef(path, stem + 'displayName', fallback) - - # First without quantifier, then quantified each way: - result['byte_unit'] = findEntryDef( - path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName', - 'bytes') - stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/' - known = [] # cases where we *do* have a given version: - result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known)) - # IEC 60027-2 - # http://physics.nist.gov/cuu/Units/binary.html - result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known)) - - # Used for month and day data: - namings = ( - ('standaloneLong', 'stand-alone', 'wide'), - ('standaloneShort', 'stand-alone', 'abbreviated'), - ('standaloneNarrow', 'stand-alone', 'narrow'), - ('long', 'format', 'wide'), - ('short', 'format', 'abbreviated'), - ('narrow', 'format', 'narrow'), - ) - - # Month names for 12-month calendars: - for cal in calendars: - stem = 'dates/calendars/calendar[' + cal + ']/months/' - for (key, mode, size) in namings: - prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/' - result[key + 'Months_' + cal] = ';'.join( - findEntry(path, stem + prop + "month[%d]" % i) - for i in range(1, 13)) - - # Day data (for Gregorian, at least): - stem = 'dates/calendars/calendar[gregorian]/days/' - days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat') - for (key, mode, size) in namings: - prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day' - result[key + 'Days'] = ';'.join( - findEntry(path, stem + prop + '[' + day + ']') - for day in days) - - return Locale(result) - -def addEscapes(s): - result = '' - for c in s: - n = ord(c) - if n < 128: - result += c - else: - result += "\\x" - result += "%02x" % (n) - return result - -def unicodeStr(s): - utf8 = s.encode('utf-8') - return "<size>" + str(len(utf8)) + "</size><data>" + addEscapes(utf8) + "</data>" - -def usage(): - print "Usage: cldr2qlocalexml.py 
<path-to-cldr-main>" - sys.exit() - -def integrateWeekData(filePath): - if not filePath.endswith(".xml"): - return {} - - def lookup(key): - return findEntryInFile(filePath, key, attribute='territories')[0].split() - days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun') - - firstDayByCountryCode = {} - for day in days: - for countryCode in lookup('weekData/firstDay[day=%s]' % day): - firstDayByCountryCode[countryCode] = day - - weekendStartByCountryCode = {} - for day in days: - for countryCode in lookup('weekData/weekendStart[day=%s]' % day): - weekendStartByCountryCode[countryCode] = day - - weekendEndByCountryCode = {} - for day in days: - for countryCode in lookup('weekData/weekendEnd[day=%s]' % day): - weekendEndByCountryCode[countryCode] = day - - for (key, locale) in locale_database.iteritems(): - countryCode = locale.country_code - if countryCode in firstDayByCountryCode: - locale.firstDayOfWeek = firstDayByCountryCode[countryCode] - else: - locale.firstDayOfWeek = firstDayByCountryCode["001"] - - if countryCode in weekendStartByCountryCode: - locale.weekendStart = weekendStartByCountryCode[countryCode] - else: - locale.weekendStart = weekendStartByCountryCode["001"] - - if countryCode in weekendEndByCountryCode: - locale.weekendEnd = weekendEndByCountryCode[countryCode] - else: - locale.weekendEnd = weekendEndByCountryCode["001"] - -def splitLocale(name): - """Split name into (language, script, territory) triple as generator. - - Ignores any trailing fields (with a warning), leaves script (a capitalised - four-letter token) or territory (either a number or an all-uppercase token) - empty if unspecified, returns a single-entry generator if name is a single - tag (i.e. contains no underscores). 
Always yields 1 or 3 values, never 2.""" - tags = iter(name.split('_')) - yield tags.next() # Language - tag = tags.next() - - # Script is always four letters, always capitalised: - if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): - yield tag - try: - tag = tags.next() - except StopIteration: - tag = '' - else: - yield '' - - # Territory is upper-case or numeric: - if tag and tag.isupper() or tag.isdigit(): - yield tag - tag = '' +from localetools import Error +from cldr import CldrReader +from qlocalexml import QLocaleXmlWriter +from enumdata import language_list, script_list, country_list + +def usage(name, err, message = ''): + err.write("""Usage: {} path/to/cldr/common/main [out-file.xml] +""".format(name)) # TODO: expand command-line, improve help message + if message: + err.write('\n' + message + '\n') + +def main(args, out, err): + # TODO: make calendars a command-line option + calendars = ['gregorian', 'persian', 'islamic'] # 'hebrew' + + # TODO: make argument parsing more sophisticated + name = args.pop(0) + if not args: + usage(name, err, 'Where is your CLDR data tree ?') + return 1 + + root = args.pop(0) + if not os.path.exists(os.path.join(root, 'common', 'main', 'root.xml')): + usage(name, err, + 'First argument is the root of the CLDR tree: found no common/main/root.xml under ' + + root) + return 1 + + xml = args.pop(0) if args else None + if not xml or xml == '-': + emit = out + elif not xml.endswith('.xml'): + usage(name, err, 'Please use a .xml extension on your output file name, not ' + xml) + return 1 else: - yield '' - - # If nothing is left, StopIteration will avoid the warning: - tag = (tag if tag else tags.next(),) - sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name)) - -if len(sys.argv) != 2: - usage() - -cldr_dir = sys.argv[1] - -if not os.path.isdir(cldr_dir): - usage() - -cldr_files = os.listdir(cldr_dir) - -locale_database = {} - -# see 
http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content -defaultContent_locales = [] -for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', - 'supplementalMetadata.xml'), - 'metadata/defaultContent'): - for data in ns[1:][0]: - if data[0] == u"locales": - defaultContent_locales += data[1].split() - -skips = [] -for file in defaultContent_locales: - try: - language_code, script_code, country_code = splitLocale(file) - except ValueError: - sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n') - continue - - if not (script_code or country_code): - sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n') - continue - - try: - l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code) - if not l: - skips.append(file) - continue - except xpathlite.Error as e: - sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e))) - continue - - locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l - -if skips: - wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips) - skips = [] - -for file in cldr_files: - try: - l = generateLocaleInfo(cldr_dir + "/" + file) - if not l: - skips.append(file) - continue - except xpathlite.Error as e: - sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e))) - continue - - locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l - -if skips: - wrappedwarn('skipping files [no locale info generated]: ', skips) - -integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml") -locale_keys = locale_database.keys() -locale_keys.sort() - -cldr_version = 'unknown' -ldml = open(cldr_dir+"/../dtd/ldml.dtd", "r") -for line in ldml: - if 'version cldrVersion CDATA #FIXED' in line: - cldr_version = line.split('"')[1] - -if sys.stdout.encoding != 'UTF-8' or (sys.stdout.encoding is None and 
sys.getdefaultencoding() != 'UTF-8'): - reload(sys) # Weirdly, this gets a richer sys module than the plain import got us ! - sys.setdefaultencoding('UTF-8') - -print "<localeDatabase>" -print " <version>" + cldr_version + "</version>" -print " <languageList>" -for id in enumdata.language_list: - l = enumdata.language_list[id] - print " <language>" - print " <name>" + l[0] + "</name>" - print " <id>" + str(id) + "</id>" - print " <code>" + l[1] + "</code>" - print " </language>" -print " </languageList>" - -print " <scriptList>" -for id in enumdata.script_list: - l = enumdata.script_list[id] - print " <script>" - print " <name>" + l[0] + "</name>" - print " <id>" + str(id) + "</id>" - print " <code>" + l[1] + "</code>" - print " </script>" -print " </scriptList>" - -print " <countryList>" -for id in enumdata.country_list: - l = enumdata.country_list[id] - print " <country>" - print " <name>" + l[0] + "</name>" - print " <id>" + str(id) + "</id>" - print " <code>" + l[1] + "</code>" - print " </country>" -print " </countryList>" - -def _parseLocale(l): - language = "AnyLanguage" - script = "AnyScript" - country = "AnyCountry" - - if l == "und": - raise xpathlite.Error("we are treating unknown locale like C") - - parsed = splitLocale(l) - language_code = parsed.next() - script_code = country_code = '' - try: - script_code, country_code = parsed - except ValueError: - pass - - if language_code != "und": - language_id = enumdata.languageCodeToId(language_code) - if language_id == -1: - raise xpathlite.Error('unknown language code "%s"' % language_code) - language = enumdata.language_list[language_id][0] - - if script_code: - script_id = enumdata.scriptCodeToId(script_code) - if script_id == -1: - raise xpathlite.Error('unknown script code "%s"' % script_code) - script = enumdata.script_list[script_id][0] - - if country_code: - country_id = enumdata.countryCodeToId(country_code) - if country_id == -1: - raise xpathlite.Error('unknown country code "%s"' % country_code) - 
country = enumdata.country_list[country_id][0] + try: + emit = open(xml, 'w') + except IOError as e: + usage(name, err, 'Failed to open "{}" to write output to it\n'.format(xml)) + return 1 - return (language, script, country) + if args: + usage(name, err, 'Too many arguments - excess: ' + ' '.join(args)) + return 1 -skips = [] -print " <likelySubtags>" -for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"): - tmp = {} - for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]] - tmp[data[0]] = data[1] + if emit.encoding != 'UTF-8' or (emit.encoding is None and sys.getdefaultencoding() != 'UTF-8'): + reload(sys) # Weirdly, this gets a richer sys module than the plain import got us ! + sys.setdefaultencoding('UTF-8') - try: - from_language, from_script, from_country = _parseLocale(tmp[u"from"]) - to_language, to_script, to_country = _parseLocale(tmp[u"to"]) - except xpathlite.Error as e: - if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']: - skips.append(tmp[u'to']) - else: - sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) - continue - # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags - if to_country == "AnyCountry" and from_country != to_country: - to_country = from_country - if to_script == "AnyScript" and from_script != to_script: - to_script = from_script + # TODO - command line options to tune choice of grumble and whitter: + reader = CldrReader(root, err.write, err.write) + writer = QLocaleXmlWriter(emit.write) - print " <likelySubtag>" - print " <from>" - print " <language>" + from_language + "</language>" - print " <script>" + from_script + "</script>" - print " <country>" + from_country + "</country>" - print " </from>" - print " <to>" - print " <language>" + to_language + "</language>" - print " <script>" + to_script + "</script>" - print " 
<country>" + to_country + "</country>" - print " </to>" - print " </likelySubtag>" -print " </likelySubtags>" -if skips: - wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips) -print " <localeList>" + writer.version(reader.root.cldrVersion) + writer.enumData(language_list, script_list, country_list) + writer.likelySubTags(reader.likelySubTags()) + writer.locales(reader.readLocales(calendars), calendars) -Locale.C(calendars).toXml(calendars) -for key in locale_keys: - locale_database[key].toXml(calendars) + writer.close() + return 0 -print " </localeList>" -print "</localeDatabase>" +if __name__ == '__main__': + sys.exit(main(sys.argv, sys.stdout, sys.stderr)) diff --git a/util/locale_database/cldr2qtimezone.py b/util/locale_database/cldr2qtimezone.py index 4c3609056d..70b5d1e69e 100755 --- a/util/locale_database/cldr2qtimezone.py +++ b/util/locale_database/cldr2qtimezone.py @@ -1,7 +1,7 @@ #!/usr/bin/env python2 ############################################################################# ## -## Copyright (C) 2019 The Qt Company Ltd. +## Copyright (C) 2020 The Qt Company Ltd. ## Contact: https://www.qt.io/licensing/ ## ## This file is part of the test suite of the Qt Toolkit. @@ -34,59 +34,20 @@ the CLDR data. Pass its common/ directory as first parameter to this script and the qtbase root directory as second parameter. It shall update qtbase's src/corelib/time/qtimezoneprivate_data_p.h ready for use. - -The XML structure is as follows: - - <supplementalData> - <version number="$Revision:...$"/> - <generation date="$Date:...$"/> - <windowsZones> - <mapTimezones otherVersion="..." 
typeVersion="..."> - <!-- (UTC-08:00) Pacific Time (US & Canada) --> - <mapZone other="Pacific Standard Time" territory="001" type="America/Los_Angeles"/> - <mapZone other="Pacific Standard Time" territory="CA" type="America/Vancouver America/Dawson America/Whitehorse"/> - <mapZone other="Pacific Standard Time" territory="US" type="America/Los_Angeles America/Metlakatla"/> - <mapZone other="Pacific Standard Time" territory="ZZ" type="PST8PDT"/> - </mapTimezones> - </windowsZones> - </supplementalData> """ import os -import sys -import datetime -import tempfile -import enumdata -import xpathlite -from xpathlite import DraftResolution import re -import qlocalexml2cpp +import datetime +import textwrap -findAlias = xpathlite.findAlias -findEntry = xpathlite.findEntry -findEntryInFile = xpathlite._findEntryInFile -findTagsInFile = xpathlite.findTagsInFile -unicode2hex = qlocalexml2cpp.unicode2hex -wrap_list = qlocalexml2cpp.wrap_list +from localetools import unicode2hex, wrap_list, Error, SourceFileEditor +from cldr import CldrAccess -class ByteArrayData: - def __init__(self): - self.data = [] - self.hash = {} - def append(self, s): - s = s + '\0' - if s in self.hash: - return self.hash[s] +### Data that may need updates in response to new entries in the CLDR file ### - lst = unicode2hex(s) - index = len(self.data) - if index > 65535: - print "\n\n\n#error Data index is too big!" - sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index) - sys.exit(1) - self.hash[s] = index - self.data += lst - return index +# This script shall report the update you need, if this arises. +# However, you may need to research the relevant zone's standard offset. # List of currently known Windows IDs. # If this script reports missing IDs, please add them here. 
@@ -233,12 +194,6 @@ windowsIdList = ( (u'Yakutsk Standard Time', 32400), ) -def windowsIdToKey(windowsId): - for index, pair in enumerate(windowsIdList): - if pair[0] == windowsId: - return index + 1 - return 0 - # List of standard UTC IDs to use. Not public so may be safely changed. # Do not remove IDs, as each entry is part of the API/behavior guarantee. # ( UTC Id, Offset Seconds ) @@ -285,94 +240,43 @@ utcIdList = ( (u'UTC+14:00', 50400), ) -def usage(): - print "Usage: cldr2qtimezone.py <path to cldr core/common> <path to qtbase>" - sys.exit() - -if len(sys.argv) != 3: - usage() - -cldrPath = sys.argv[1] -qtPath = sys.argv[2] - -if not os.path.isdir(cldrPath) or not os.path.isdir(qtPath): - usage() - -windowsZonesPath = cldrPath + "/supplemental/windowsZones.xml" -tempFileDir = qtPath -dataFilePath = qtPath + "/src/corelib/time/qtimezoneprivate_data_p.h" - -if not (os.path.isfile(windowsZonesPath) and os.path.isfile(dataFilePath)): - usage() - -cldr_version = 'unknown' -ldml = open(cldrPath + "/dtd/ldml.dtd", "r") -for line in ldml: - if 'version cldrVersion CDATA #FIXED' in line: - cldr_version = line.split('"')[1] - -# [[u'version', [(u'number', u'$Revision: 7825 $')]]] -versionNumber = findTagsInFile(windowsZonesPath, "version")[0][1][0][1] - -mapTimezones = findTagsInFile(windowsZonesPath, "windowsZones/mapTimezones") - -defaultDict = {} -windowsIdDict = {} - -if mapTimezones: - badZones = set() - for mapZone in mapTimezones: - # [u'mapZone', [(u'territory', u'MH'), (u'other', u'UTC+12'), (u'type', u'Pacific/Majuro Pacific/Kwajalein')]] - if mapZone[0] == u'mapZone': - data = {} - for attribute in mapZone[1]: - if attribute[0] == u'other': - data['windowsId'] = attribute[1] - if attribute[0] == u'territory': - data['countryCode'] = attribute[1] - if attribute[0] == u'type': - data['ianaList'] = attribute[1] - - data['windowsKey'] = windowsIdToKey(data['windowsId']) - if data['windowsKey'] <= 0: - badZones.add(data['windowsId']) - - countryId = 0 - if 
data['countryCode'] == u'001': - defaultDict[data['windowsKey']] = data['ianaList'] - else: - data['countryId'] = enumdata.countryCodeToId(data['countryCode']) - if data['countryId'] < 0: - raise xpathlite.Error("Unknown Country Code \"%s\"" % data['countryCode']) - data['country'] = enumdata.country_list[data['countryId']][0] - windowsIdDict[data['windowsKey'], data['countryId']] = data - if badZones: - sys.stderr.write('\n\t'.join(["\nUnknown Windows ID, please add:"] + sorted(badZones)) - + "\nto the windowIdList in cldr2qtimezone.py\n\n") - raise xpathlite.Error("Unknown Windows IDs") - -print "Input file parsed, now writing data" - -GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n" -GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n" - -# Create a temp file to write the new data into -(newTempFile, newTempFilePath) = tempfile.mkstemp("qtimezone_data_p", dir=tempFileDir) -newTempFile = os.fdopen(newTempFile, "w") - -# Open the old file and copy over the first non-generated section to the new file -oldDataFile = open(dataFilePath, "r") -s = oldDataFile.readline() -while s and s != GENERATED_BLOCK_START: - newTempFile.write(s) - s = oldDataFile.readline() - -# Write out generated block start tag and warning -newTempFile.write(GENERATED_BLOCK_START) -newTempFile.write(""" +### End of data that may need updates in response to CLDR ### + +class ByteArrayData: + def __init__(self): + self.data = [] + self.hash = {} + + def append(self, s): + s = s + '\0' + if s in self.hash: + return self.hash[s] + + lst = unicode2hex(s) + index = len(self.data) + if index > 0xffff: + raise Error('Index ({}) outside the uint16 range !'.format(index)) + self.hash[s] = index + self.data += lst + return index + + def write(self, out, name): + out('\nstatic const char {}[] = {{\n'.format(name)) + out(wrap_list(self.data)) + out('\n};\n') + +class ZoneIdWriter (SourceFileEditor): + def write(self, version, defaults, windowsIds): + self.__writeWarning(version) + windows, iana = 
self.__writeTables(self.writer.write, defaults, windowsIds) + windows.write(self.writer.write, 'windowsIdData') + iana.write(self.writer.write, 'ianaIdData') + + def __writeWarning(self, version): + self.writer.write(""" /* - This part of the file was generated on %s from the - Common Locale Data Repository v%s supplemental/windowsZones.xml file %s + This part of the file was generated on {} from the + Common Locale Data Repository v{} file supplemental/windowsZones.xml http://www.unicode.org/cldr/ @@ -380,80 +284,111 @@ newTempFile.write(""" edited) CLDR data; see qtbase/util/locale_database/. */ -""" % (str(datetime.date.today()), cldr_version, versionNumber) ) - -windowsIdData = ByteArrayData() -ianaIdData = ByteArrayData() - -# Write Windows/IANA table -newTempFile.write("// Windows ID Key, Country Enum, IANA ID Index\n") -newTempFile.write("static const QZoneData zoneDataTable[] = {\n") -for index in sorted(windowsIdDict): - data = windowsIdDict[index] - newTempFile.write(" { %6d,%6d,%6d }, // %s / %s\n" - % (data['windowsKey'], - data['countryId'], - ianaIdData.append(data['ianaList']), - data['windowsId'], - data['country'])) -newTempFile.write(" { 0, 0, 0 } // Trailing zeroes\n") -newTempFile.write("};\n\n") - -print "Done Zone Data" - -# Write Windows ID key table -newTempFile.write("// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n") -newTempFile.write("static const QWindowsData windowsDataTable[] = {\n") -for index, pair in enumerate(windowsIdList): - newTempFile.write(" { %6d,%6d,%6d,%6d }, // %s\n" - % (index + 1, windowsIdData.append(pair[0]), - ianaIdData.append(defaultDict[index + 1]), pair[1], pair[0])) -newTempFile.write(" { 0, 0, 0, 0 } // Trailing zeroes\n") -newTempFile.write("};\n\n") - -print "Done Windows Data Table" - -# Write UTC ID key table -newTempFile.write("// IANA ID Index, UTC Offset\n") -newTempFile.write("static const QUtcData utcDataTable[] = {\n") -for pair in utcIdList: - newTempFile.write(" { %6d,%6d }, // %s\n" 
- % (ianaIdData.append(pair[0]), pair[1], pair[0])) -newTempFile.write(" { 0, 0 } // Trailing zeroes\n") -newTempFile.write("};\n\n") - -print "Done UTC Data Table" - -# Write out Windows ID's data -newTempFile.write("static const char windowsIdData[] = {\n") -newTempFile.write(wrap_list(windowsIdData.data)) -newTempFile.write("\n};\n\n") - -# Write out IANA ID's data -newTempFile.write("static const char ianaIdData[] = {\n") -newTempFile.write(wrap_list(ianaIdData.data)) -newTempFile.write("\n};\n") - -print "Done ID Data Table" - -# Write out the end of generated block tag -newTempFile.write(GENERATED_BLOCK_END) -s = oldDataFile.readline() - -# Skip through the old generated data in the old file -while s and s != GENERATED_BLOCK_END: - s = oldDataFile.readline() - -# Now copy the rest of the original file into the new file -s = oldDataFile.readline() -while s: - newTempFile.write(s) - s = oldDataFile.readline() - -# Now close the old and new file, delete the old file and copy the new file in its place -newTempFile.close() -oldDataFile.close() -os.remove(dataFilePath) -os.rename(newTempFilePath, dataFilePath) - -print "Data generation completed, please check the new file at " + dataFilePath +""".format(str(datetime.date.today()), version)) + + @staticmethod + def __writeTables(out, defaults, windowsIds): + windowsIdData, ianaIdData = ByteArrayData(), ByteArrayData() + + # Write Windows/IANA table + out('// Windows ID Key, Country Enum, IANA ID Index\n') + out('static const QZoneData zoneDataTable[] = {\n') + for index, data in sorted(windowsIds.items()): + out(' {{ {:6d},{:6d},{:6d} }}, // {} / {}\n'.format( + data['windowsKey'], data['countryId'], + ianaIdData.append(data['ianaList']), + data['windowsId'], data['country'])) + out(' { 0, 0, 0 } // Trailing zeroes\n') + out('};\n\n') + + # Write Windows ID key table + out('// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n') + out('static const QWindowsData windowsDataTable[] = {\n') + for index, pair 
in enumerate(windowsIdList, 1): + out(' {{ {:6d},{:6d},{:6d},{:6d} }}, // {}\n'.format( + index, + windowsIdData.append(pair[0]), + ianaIdData.append(defaults[index]), + pair[1], pair[0])) + out(' { 0, 0, 0, 0 } // Trailing zeroes\n') + out('};\n\n') + + # Write UTC ID key table + out('// IANA ID Index, UTC Offset\n') + out('static const QUtcData utcDataTable[] = {\n') + for pair in utcIdList: + out(' {{ {:6d},{:6d} }}, // {}\n'.format( + ianaIdData.append(pair[0]), pair[1], pair[0])) + out(' { 0, 0 } // Trailing zeroes\n') + out('};\n') + + return windowsIdData, ianaIdData + +def usage(err, name, message=''): + err.write("""Usage: {} path/to/cldr/core/common path/to/qtbase +""".format(name)) # TODO: more interesting message + if message: + err.write('\n' + message + '\n') + +def main(args, out, err): + """Parses CLDR's data and updates Qt's representation of it. + + Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as + arguments. Expects two command-line options: the root of the + unpacked CLDR data-file tree and the root of the qtbase module's + checkout. 
Updates QTimeZone's private data about Windows time-zone + IDs.""" + name = args.pop(0) + if len(args) != 2: + usage(err, name, "Expected two arguments") + return 1 + + cldrPath = args.pop(0) + qtPath = args.pop(0) + + if not os.path.isdir(qtPath): + usage(err, name, "No such Qt directory: " + qtPath) + return 1 + if not os.path.isdir(cldrPath): + usage(err, name, "No such CLDR directory: " + cldrPath) + return 1 + + dataFilePath = os.path.join(qtPath, 'src', 'corelib', 'time', 'qtimezoneprivate_data_p.h') + if not os.path.isfile(dataFilePath): + usage(err, name, 'No such file: ' + dataFilePath) + return 1 + + try: + version, defaults, winIds = CldrAccess(cldrPath).readWindowsTimeZones( + dict((name, ind) for ind, name in enumerate((x[0] for x in windowsIdList), 1))) + except IOError as e: + usage(err, name, + 'Failed to open common/supplemental/windowsZones.xml: ' + (e.message or e.args[1])) + return 1 + except Error as e: + err.write('\n'.join(textwrap.wrap( + 'Failed to read windowsZones.xml: ' + (e.message or e.args[1]), + subsequent_indent=' ', width=80)) + '\n') + return 1 + + out.write('Input file parsed, now writing data\n') + try: + writer = ZoneIdWriter(dataFilePath, qtPath) + except IOError as e: + err.write('Failed to open files to transcribe: {}'.format(e.message or e.args[1])) + return 1 + + try: + writer.write(version, defaults, winIds) + except Error as e: + writer.cleanup() + err.write('\nError in Windows ID data: ' + e.message + '\n') + return 1 + + writer.close() + out.write('Data generation completed, please check the new file at ' + dataFilePath + '\n') + return 0 + +if __name__ == '__main__': + import sys + sys.exit(main(sys.argv, sys.stdout, sys.stderr)) diff --git a/util/locale_database/ldml.py b/util/locale_database/ldml.py new file mode 100644 index 0000000000..e3e3a2e4ba --- /dev/null +++ b/util/locale_database/ldml.py @@ -0,0 +1,589 @@ +############################################################################# +## +## Copyright (C) 
2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. +## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Parsing the Locale Data Markup Language + +It's an XML format, so the raw parsing of XML is, of course, delegated +to xml.dom.minidom; but it has its own specific schemata and some +funky rules for combining data from various files (inheritance between +locales). The use of it we're interested in is extraction of CLDR's +data, so some of the material here is specific to CLDR; see cldr.py +for how it is mainly used. 
+ +Provides various classes to wrap xml.dom's objects, specifically those +returned by minidom.parse() and their child-nodes: + Node -- wraps any node in the DOM tree + XmlScanner -- wraps the root element of a stand-alone XML file + Supplement -- specializes XmlScanner for supplemental data files + LocaleScanner -- wraps a locale's inheritance-chain of file roots + +See individual classes for further detail. +""" +from localetools import Error +from dateconverter import convert_date + +class Node (object): + """Wrapper for an arbitrary DOM node. + + Provides various ways to select children of a node. Selected child + nodes are returned wrapped as Node objects. A Node exposes the + raw DOM node it wraps via its .dom attribute.""" + + def __init__(self, elt, dullAttrs = None, draft = 0): + """Wraps a DOM node for ease of access. + + First argument, elt, is the DOM node to wrap. + + Optional second argument, dullAttrs, should either be None or + map each LDML tag name to a list of the names of + non-distinguishing attributes for nodes with the given tag + name. If None is given, no distinguishing attribute checks are + performed. + + (Optional third argument, draft, should only be supplied by + this class's creation of child nodes; it is the maximum draft + score of any ancestor of the new node.)""" + self.dom, self.__dull = elt, dullAttrs + try: + attr = elt.attributes['draft'].nodeValue + except KeyError: + self.draft = draft + else: + self.draft = max(draft, self.draftScore(attr)) + + def findAllChildren(self, tag, wanted = None, allDull = False): + """All children that do have the given tag and attributes. + + First argument is the tag: children with any other tag are + ignored. + + Optional second argument, wanted, should either be None or map + attribute names to the values they must have. Only child nodes + with these attributes set to the given values are yielded. 
+ + By default, nodes that have distinguishing attributes, other + than those specified in wanted, are ignored. Pass the allDull + parameter a true value to suppress this check.""" + + if self.__dull is None: + allDull = True + dull = () if allDull else self.__dull[tag] + + for child in self.dom.childNodes: + if child.nodeType != child.ELEMENT_NODE: + continue + if child.nodeName != tag: + continue + + if wanted: + try: + if any(child.attributes[k].nodeValue != v + for k, v in wanted.items()): + continue + except KeyError: # Some wanted attribute is missing + continue + + if not (allDull or all(k in dull or k in wanted + for k in child.attributes.keys())): + continue + + elif not (allDull or all(k in dull + for k in child.attributes.keys())): + continue + + yield Node(child, self.__dull, self.draft) + + def findUniqueChild(self, tag): + """Returns the single child with the given nodeName. + + Raises Error if there is no such child or there is more than + one.""" + seq = self.findAllChildren(tag) + try: + node = seq.next() + except StopIteration: + raise Error('No child found where one was expected', tag) + for it in seq: + raise Error('Many children found where only one was expected', tag) + return node + + @classmethod + def draftScore(cls, level): + """Maps draft level names to numeric scores. + + Single parameter, level, is the least sure value of the draft + attribute on a node that you're willing to accept; returns a + numeric value (lower is less drafty). + + Tempting as it is to insist on low draft scores, there are + many locales in which pretty much every leaf is + unconfirmed. It may make sense to actually check each + XmlScanner object, or each node in each LocaleScanner's nodes + list, to see what its distribution of draft level looks like, + so as to set the acceptable draft score for its elements + accordingly. 
However, for the moment, we mostly just accept + all elements, regardless of draft values (the one exception is + am/pm indicators).""" + return cls.__draftScores.get(level, 5) if level else 0 + + # Implementation details: + __draftScores = dict(true = 4, unconfirmed = 3, provisional = 2, + contributed = 1, approved = 0, false = 0) + +def _parseXPath(selector): + # Split "tag[attr=val][...]" into tag-name and attribute mapping + attrs = selector.split('[') + name = attrs.pop(0) + if attrs: + attrs = [x.strip() for x in attrs] + assert all(x.endswith(']') for x in attrs) + attrs = [x[:-1].split('=') for x in attrs] + assert all(len(x) in (1, 2) for x in attrs) + attrs = (('type', x[0]) if len(x) == 1 else x for x in attrs) + return name, dict(attrs) + +def _iterateEach(iters): + # Flatten a two-layer iterator. + for it in iters: + for item in it: + yield item + +class XmlScanner (object): + """Wrap an XML file to enable XPath access to its nodes. + """ + def __init__(self, node): + self.root = node + + def findNodes(self, xpath): + """Return all nodes under self.root matching this xpath. + + Ignores any excess attributes.""" + elts = (self.root,) + for selector in xpath.split('/'): + tag, attrs = _parseXPath(selector) + elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts)) + if not elts: + break + return elts + +class Supplement (XmlScanner): + def find(self, xpath): + elts = self.findNodes(xpath) + for elt in _iterateEach(e.dom.childNodes if e.dom.childNodes else (e.dom,) + for e in elts): + if elt.attributes: + yield (elt.nodeName, + dict((k, v if isinstance(v, basestring) else v.nodeValue) + for k, v in elt.attributes.items())) + +class LocaleScanner (object): + def __init__(self, name, nodes, root): + self.name, self.nodes, self.base = name, nodes, root + + def find(self, xpath, default = None, draft = None): + """XPath search for the content of an element. + + Required argument, xpath, is the XPath to search for. 
Optional + second argument is a default value to use, if no such node is + found. Optional third argument is a draft score (see + Node.draftScore() for details); if given, leaf elements with + higher draft scores are ignored.""" + try: + for elt in self.__find(xpath): + try: + if draft is None or elt.draft <= draft: + return elt.dom.firstChild.nodeValue + except (AttributeError, KeyError): + pass + except Error as e: + if default is None: + raise + return default + + def tagCodes(self): + """Yields four tag codes + + The tag codes are language, script, country and variant; an + empty value for any of them indicates that no value was + provided. The values are obtained from the primary file's + top-level <identity> element. An Error is raised if any + top-level <alias> element of this file has a non-empty source + attribute; that attribute value is mentioned in the error's + message.""" + root = self.nodes[0] + for alias in root.findAllChildren('alias', allDull=True): + try: + source = alias.dom.attributes['source'].nodeValue + except (KeyError, AttributeError): + pass + else: + raise Error('Alias to {}'.format(source)) + + ids = root.findUniqueChild('identity') + for code in ('language', 'script', 'territory', 'variant'): + for node in ids.findAllChildren(code, allDull=True): + try: + yield node.dom.attributes['type'].nodeValue + except (KeyError, AttributeError): + pass + else: + break # only want one value for each code + else: # No value for this code, use empty + yield '' + + def currencyData(self, isoCode): + """Fetches currency data for this locale. + + Single argument, isoCode, is the ISO currency code for the + currency in use in the country. See also numericData, which + includes some currency formats. 
+ """ + if isoCode: + stem = 'numbers/currencies/currency[{}]/'.format(isoCode) + symbol = self.find(stem + 'symbol', '') + displays = tuple(self.find(stem + 'displayName' + tail, '') + for tail in ('',) + tuple( + '[count={}]'.format(x) for x in ('zero', 'one', 'two', + 'few', 'many', 'other'))) + while displays and not displays[-1]: + displays = displays[:-1] + name = ';'.join(displays) + else: + symbol = name = '' + yield 'currencySymbol', symbol + yield 'currencyDisplayName', name + + def numericData(self, lookup, complain = lambda text: None): + """Generate assorted numeric data for the locale. + + First argument, lookup, is a callable that maps a numbering + system's name to certain data about the system, as a mapping; + we expect this to have u'digits' as a key. + """ + system = self.find('numbers/defaultNumberingSystem') + stem = 'numbers/symbols[numberSystem={}]/'.format(system) + decimal = self.find(stem + 'decimal') + group = self.find(stem + 'group') + assert decimal != group, (self.name, system, decimal) + yield 'decimal', decimal + yield 'group', group + yield 'percent', self.find(stem + 'percentSign') + yield 'list', self.find(stem + 'list') + yield 'exp', self.find(stem + 'exponential') + + digits = lookup(system)['digits'] + assert len(digits) == 10 + zero = digits[0] + # Qt's number-formatting code assumes digits are consecutive: + assert all(ord(c) == i for i, c in enumerate(digits, ord(zero))) + yield 'zero', zero + + plus = self.find(stem + 'plusSign') + minus = self.find(stem + 'minusSign') + yield 'plus', plus + yield 'minus', minus + + # Currency formatting: + xpath = 'numbers/currencyFormats/currencyFormatLength/currencyFormat[accounting]/pattern' + try: + money = self.find(xpath.replace('Formats/', + 'Formats[numberSystem={}]/'.format(system))) + except Error: + money = self.find(xpath) + money = self.__currencyFormats(money, plus, minus) + yield 'currencyFormat', money.next() + neg = '' + for it in money: + assert not neg, 'There should 
be at most one more pattern' + neg = it + yield 'currencyNegativeFormat', neg + + def textPatternData(self): + for key in ('quotationStart', 'alternateQuotationEnd', + 'quotationEnd', 'alternateQuotationStart'): + yield key, self.find('delimiters/' + key) + + for key in ('start', 'middle', 'end'): + yield ('listPatternPart' + key.capitalize(), + self.__fromLdmlListPattern(self.find( + 'listPatterns/listPattern/listPatternPart[{}]'.format(key)))) + yield ('listPatternPartTwo', + self.__fromLdmlListPattern(self.find( + 'listPatterns/listPattern/listPatternPart[2]'))) + + stem = 'dates/calendars/calendar[gregorian]/' + # TODO: is wide really the right width to use here ? + # abbreviated might be an option ... or try both ? + meridiem = stem + 'dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/' + for key in ('am', 'pm'): + yield key, self.find(meridiem + 'dayPeriod[{}]'.format(key), + draft = Node.draftScore('contributed')) + + for pair in (('long', 'full'), ('short', 'short')): + for key in ('time', 'date'): + yield (pair[0] + key.capitalize() + 'Format', + convert_date(self.find( + stem + '{}Formats/{}FormatLength[{}]/{}Format/pattern'.format( + key, key, pair[1], key)))) + + def endonyms(self, language, script, country, variant): + # TODO: take variant into account ? 
+ for seq in ((language, script, country), + (language, script), (language, country), (language,)): + if not all(seq): + continue + try: + yield ('languageEndonym', + self.find('localeDisplayNames/languages/language[{}]' + .format('_'.join(seq)))) + except Error: + pass + else: + break + else: + # grumble(failed to find endonym for language) + yield 'languageEndonym', '' + + yield ('countryEndonym', + self.find('localeDisplayNames/territories/territory[{}]' + .format(country), '')) + + def unitData(self): + yield ('byte_unit', + self.find('units/unitLength[long]/unit[digital-byte]/displayName', + 'bytes')) + + unit = self.__findUnit('', 'B') + cache = [] # Populated by the SI call, to give hints to the IEC call + yield ('byte_si_quantified', + ';'.join(self.__unitCount('', unit, cache))) + # IEC 60027-2 + # http://physics.nist.gov/cuu/Units/binary.html + yield ('byte_iec_quantified', + ';'.join(self.__unitCount('bi', 'iB', cache))) + + def calendarNames(self, calendars): + namings = self.__nameForms + for cal in calendars: + stem = 'dates/calendars/calendar[' + cal + ']/months/' + for key, mode, size in namings: + prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/' + yield (key + 'Months_' + cal, + ';'.join(self.find(stem + prop + 'month[{}]'.format(i)) + for i in range(1, 13))) + + # Day data (for Gregorian, at least): + stem = 'dates/calendars/calendar[gregorian]/days/' + days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat') + for (key, mode, size) in namings: + prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day' + yield (key + 'Days', + ';'.join(self.find(stem + prop + '[' + day + ']') + for day in days)) + + # Implementation details + __nameForms = ( + ('standaloneLong', 'stand-alone', 'wide'), + ('standaloneShort', 'stand-alone', 'abbreviated'), + ('standaloneNarrow', 'stand-alone', 'narrow'), + ('long', 'format', 'wide'), + ('short', 'format', 'abbreviated'), + ('narrow', 'format', 'narrow'), + ) # Used for month and day names + + def 
__find(self, xpath): + retries = [ xpath.split('/') ] + while retries: + tags, elts, roots = retries.pop(), self.nodes, (self.base.root,) + for selector in tags: + tag, attrs = _parseXPath(selector) + elts = tuple(_iterateEach(e.findAllChildren(tag, attrs) for e in elts)) + if not elts: + break + + else: # Found matching elements + # Possibly filter elts to prefer the least drafty ? + for elt in elts: + yield elt + + # Process roots separately: otherwise the alias-processing + # is excessive. + for i, selector in enumerate(tags): + tag, attrs = _parseXPath(selector) + + for alias in tuple(_iterateEach(r.findAllChildren('alias', allDull=True) + for r in roots)): + if alias.dom.attributes['source'].nodeValue == 'locale': + replace = alias.dom.attributes['path'].nodeValue.split('/') + retries.append(self.__xpathJoin(tags[:i], replace, tags[i:])) + + roots = tuple(_iterateEach(r.findAllChildren(tag, attrs) for r in roots)) + if not roots: + if retries: # Let outer loop fall back on an alias path: + break + sought = '/'.join(tags) + if sought != xpath: + sought += ' (for {})'.format(xpath) + raise Error('All lack child {} for {} in {}'.format( + selector, sought, self.name)) + + else: # Found matching elements + for elt in roots: + yield elt + + sought = '/'.join(tags) + if sought != xpath: + sought += ' (for {})'.format(xpath) + raise Error('No {} in {}'.format(sought, self.name)) + + def __findUnit(self, keySuffix, quantify, fallback=''): + # The displayName for a quantified unit in en.xml is kByte + # (even for unitLength[narrow]) instead of kB (etc.), so + # prefer any unitPattern provided, but prune its placeholder: + for size in ('short', 'narrow'): # TODO: reverse order ? 
+ stem = 'units/unitLength[{}]/unit[digital-{}byte]/'.format(size + keySuffix, quantify) + for count in ('many', 'few', 'two', 'other', 'zero', 'one'): + try: + ans = self.find(stem + 'unitPattern[count={}]'.format(count)) + except Error: + continue + + # TODO: do count-handling, instead of discarding placeholders + if False: # TODO: do it this way, instead ! + ans = ans.replace('{0}', '').strip() + elif ans.startswith('{0}'): + ans = ans[3:].lstrip() + if ans: + return ans + + try: + return self.find(stem + 'displayName') + except Error: + pass + + return fallback + + def __unitCount(self, keySuffix, suffix, cache, + # Stop at exa/exbi: 16 exbi = 2^{64} < zetta = + # 1000^7 < zebi = 2^{70}, the next quantifiers up: + siQuantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')): + """Work out the unit quantifiers. + + Unfortunately, the CLDR data only go up to terabytes and we + want all the way to exabytes; but we can recognize the SI + quantifiers as prefixes, strip and identify the tail as the + localized translation for 'B' (e.g. French has 'octet' for + 'byte' and uses ko, Mo, Go, To from which we can extrapolate + Po, Eo). + + Should be called first for the SI quantifiers, with suffix = + 'B', then for the IEC ones, with suffix = 'iB'; the list cache + (initially empty before first call) is used to let the second + call know what the first learned about the localized unit. 
+ """ + if suffix == 'iB': # second call, re-using first's cache + if cache: + byte = cache.pop() + if all(byte == k for k in cache): + suffix = 'i' + byte + for q in siQuantifiers: + # Those don't (yet, v36) exist in CLDR, so we always get the fall-back: + yield self.__findUnit(keySuffix, q[:2], q[0].upper() + suffix) + else: # first call + tail = suffix = suffix or 'B' + for q in siQuantifiers: + it = self.__findUnit(keySuffix, q) + # kB for kilobyte, in contrast with KiB for IEC: + q = q[0] if q == 'kilo' else q[0].upper() + if not it: + it = q + tail + elif it.startswith(q): + rest = it[1:] + tail = rest if all(rest == k for k in cache) else suffix + cache.append(rest) + yield it + + @staticmethod + def __currencyFormats(patterns, plus, minus): + for p in patterns.split(';'): + p = p.replace('0', '#').replace(',', '').replace('.', '') + try: + cut = p.find('#') + 1 + except ValueError: + pass + else: + p = p[:cut] + p[cut:].replace('#', '') + p = p.replace('#', "%1") + # According to http://www.unicode.org/reports/tr35/#Number_Format_Patterns + # there can be doubled or tripled currency sign, however none of the + # locales use that. + p = p.replace(u'\xa4', "%2") + # Single quote goes away, but double goes to single: + p = p.replace("''", '###').replace("'", '').replace('###', "'") + # Use number system's signs: + p = p.replace('+', plus).replace('-', minus) + yield p + + @staticmethod + def __fromLdmlListPattern(pattern): + # This is a very limited parsing of the format for list pattern part only. 
+ return pattern.replace('{0}', '%1').replace('{1}', '%2').replace('{2}', '%3') + + @staticmethod + def __fromLdmlPath(seq): # tool function for __xpathJoin() + """Convert LDML's [@name='value'] to our [name=value] form.""" + for it in seq: + # First dismember it: + attrs = it.split('[') + tag = attrs.pop(0) + if not attrs: # Short-cut the easy case: + yield it + continue + + assert all(x.endswith(']') for x in attrs) + attrs = [x[:-1].split('=') for x in attrs] + # Then fix each attribute specification in it: + attrs = [(x[0][1:] if x[0].startswith('@') else x[0], + x[1][1:-1] if x[1].startswith("'") and x[1].endswith("'") else x[1]) + for x in attrs] + # Finally, put it all back together: + attrs = ['='.join(x) + ']' for x in attrs] + attrs.insert(0, tag) + yield '['.join(attrs) + + @classmethod + def __xpathJoin(cls, head, insert, tail): + """Join three lists of XPath selectors. + + Each of head, insert and tail is a sequence of selectors but + insert may start with some uses of '..', that we want to + resolve away, and may use LDML's attribute format, that we + want to convert to our format.""" + while insert and insert[0] == '..': + insert.pop(0) + head.pop() + return head + list(cls.__fromLdmlPath(insert)) + tail diff --git a/util/locale_database/localetools.py b/util/locale_database/localetools.py new file mode 100644 index 0000000000..29153366b3 --- /dev/null +++ b/util/locale_database/localetools.py @@ -0,0 +1,164 @@ +############################################################################# +## +## Copyright (C) 2020 The Qt Company Ltd. +## Contact: https://www.qt.io/licensing/ +## +## This file is part of the test suite of the Qt Toolkit. 
+## +## $QT_BEGIN_LICENSE:GPL-EXCEPT$ +## Commercial License Usage +## Licensees holding valid commercial Qt licenses may use this file in +## accordance with the commercial license agreement provided with the +## Software or, alternatively, in accordance with the terms contained in +## a written agreement between you and The Qt Company. For licensing terms +## and conditions see https://www.qt.io/terms-conditions. For further +## information use the contact form at https://www.qt.io/contact-us. +## +## GNU General Public License Usage +## Alternatively, this file may be used under the terms of the GNU +## General Public License version 3 as published by the Free Software +## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +## included in the packaging of this file. Please review the following +## information to ensure the GNU General Public License requirements will +## be met: https://www.gnu.org/licenses/gpl-3.0.html. +## +## $QT_END_LICENSE$ +## +############################################################################# +"""Utilities shared among the CLDR extraction tools. + +Functions: + unicode2hex() -- converts unicode text to UCS-2 in hex form. + wrap_list() -- map list to comma-separated string, 20 entries per line. + +Classes: + Error -- A shared error class. + Transcriber -- edit a file by writing a temporary file, then renaming. + SourceFileEditor -- adds standard prelude and tail handling to Transcriber. 
import os
import tempfile

try: # Python 2 provides StandardError; Python 3 removed it.
    _ErrorBase = StandardError
except NameError:
    _ErrorBase = Exception

class Error (_ErrorBase):
    """A shared error class.

    The first constructor argument is kept as self.message and is
    what str() of the instance returns."""
    __upinit = _ErrorBase.__init__
    def __init__(self, msg, *args):
        self.__upinit(msg, *args)
        self.message = msg
    def __str__(self):
        return self.message

def unicode2hex(s):
    """Map text to a list of hex strings, one per UTF-16 code unit.

    A character whose code point exceeds 0xFFFF contributes two
    entries, its high and low surrogates; any other character
    contributes one."""
    lst = []
    for x in s:
        v = ord(x)
        if v > 0xFFFF:
            # make a surrogate pair
            # copied from qchar.h
            high = (v >> 10) + 0xd7c0
            low = (v % 0x400 + 0xdc00)
            lst.append(hex(high))
            lst.append(hex(low))
        else:
            lst.append(hex(v))
    return lst

def wrap_list(lst):
    """Join a list of strings with commas, 20 entries per line.

    Entries within a line are separated by ', '; lines are separated
    by ',' followed by a newline. An empty list gives ''."""
    def split(lst, size):
        while lst:
            head, lst = lst[:size], lst[size:]
            yield head
    return ",\n".join(", ".join(x) for x in split(lst, 20))

class Transcriber (object):
    """Helper class to facilitate rewriting source files.

    This class takes care of the temporary file manipulation. Derived
    classes need to implement transcribing of the content, with
    whatever modifications they may want. Members reader and writer
    are exposed; use writer.write() to output to the new file; use
    reader.readline() or iterate reader to read the original.

    Callers should call close() on success or cleanup() on failure (to
    clear away the temporary file). Calling cleanup() after a
    successful close() does nothing.
    """
    def __init__(self, path, temp):
        """Open path for reading and a temporary file in temp for writing."""
        # Open the old file
        self.reader = open(path)
        try:
            # Create a temp file to write the new data into
            temp, tempPath = tempfile.mkstemp(os.path.split(path)[1], dir = temp)
        except Exception:
            # Don't leak the open reader if no temporary file can be made:
            self.reader.close()
            raise
        self.__names = path, tempPath
        self.writer = os.fdopen(temp, "w")

    def close(self):
        """Close both files and replace the original with the new file."""
        self.reader.close()
        self.writer.close()
        self.reader = self.writer = None
        source, temp = self.__names
        os.remove(source)
        os.rename(temp, source)
        # The temporary file is gone now, so cleanup() has nothing left to do:
        self.__names = ()

    def cleanup(self):
        """Close both files and remove the temporary file, if still pending."""
        if self.__names:
            # close() sets these to None before it touches the
            # file-system, so they may be unset if it failed part-way:
            for stream in (self.reader, self.writer):
                if stream is not None:
                    stream.close()
            self.reader = self.writer = None
            # Remove temp-file:
            os.remove(self.__names[1])
            self.__names = ()

class SourceFileEditor (Transcriber):
    """Transcriber with transcription of code around a generated block.

    We have a common pattern of source files with a generated part
    embedded in a context that's not touched by the regeneration
    scripts. The generated part is, in each case, marked with a common
    pair of start and end markers. We transcribe the old file to a new
    temporary file; on success, we then remove the original and move
    the new version to replace it.

    This class takes care of transcribing the parts before and after
    the generated content; on creation, an instance will copy the
    preamble up to the start marker; its close() will skip over the
    original's generated content and resume transcribing with the end
    marker. Derived classes need only implement the generation of the
    content in between.

    Callers should call close() on success or cleanup() on failure (to
    clear away the temporary file); see Transcriber.
    """
    __upinit = Transcriber.__init__
    def __init__(self, path, temp):
        """Set up the source file editor.

        Requires two arguments: the path to the source file to be read
        and, on success, replaced with a new version; and the
        directory in which to store the temporary file during the
        rewrite."""
        self.__upinit(path, temp)
        self.__copyPrelude()

    __upclose = Transcriber.close
    def close(self):
        """Copy the tail of the original, then replace it with the new file."""
        self.__copyTail()
        self.__upclose()

    # Implementation details:
    GENERATED_BLOCK_START = '// GENERATED PART STARTS HERE'
    GENERATED_BLOCK_END = '// GENERATED PART ENDS HERE'

    def __copyPrelude(self):
        # Copy over the first non-generated section to the new file
        # (up to and including the start marker's line):
        for line in self.reader:
            self.writer.write(line)
            if line.strip() == self.GENERATED_BLOCK_START:
                break

    def __copyTail(self):
        # Skip through the old generated data in the old file
        for line in self.reader:
            if line.strip() == self.GENERATED_BLOCK_END:
                self.writer.write(line)
                break
        # Transcribe the remainder:
        for line in self.reader:
            self.writer.write(line)
def addEscapes(s):
    """Return s with each non-ASCII character replaced by a hex escape.

    Characters whose code point is below 128 are kept as they are;
    any other is replaced by a backslash, an 'x' and its code point
    in lower-case hex, zero-padded to at least two digits."""
    pieces = []
    for char in s:
        code = ord(char)
        if code < 128:
            pieces.append(char)
        else:
            pieces.append('\\x{:02x}'.format(code))
    return ''.join(pieces)
def __init__(self, filename):
    """Parse a QLocaleXML file and index its enumeration tables.

    Sets up public mappings languages, scripts and countries, each
    from ID to (name, code), along with dupes (the names used both
    for a language and for a country) and cldrVersion (the text of
    the root's version element)."""
    self.root = self.__parse(filename)
    # Each table is a sequence of (id, name, code) triples:
    tables = tuple(tuple(self.__loadMap(category))
                   for category in ('language', 'script', 'country'))
    self.__likely = tuple(self.__likelySubtagsMap())

    def byId(table):
        # {ID: (name, code)}
        return {entry[0]: entry[1:] for entry in table}

    def byName(table):
        # {name: (ID, code)}
        return {entry[1]: (entry[0], entry[2]) for entry in table}

    # Public mappings, keyed on ID:
    self.languages, self.scripts, self.countries = [byId(t) for t in tables]
    # Private mappings, keyed on name:
    self.__langByName, self.__textByName, self.__landByName = [byName(t) for t in tables]
    # Other properties:
    self.dupes = set(self.__langByName).intersection(self.__landByName)
    self.cldrVersion = self.__firstChildText(self.root, "version")
def languageIndices(self, locales):
    """Yield an (index, name) pair for each language, in ID order.

    The argument, locales, must support count(); it is asked how
    many times each language ID appears in it. A language with no
    entries in locales gets index 0; each other language gets the
    total number of entries counted for the languages before it.

    Languages are visited in ascending order of ID, rather than in
    whatever order the mapping happens to iterate, so that the
    running total is well-defined."""
    index = 0
    for key in sorted(self.languages):
        i, count = 0, locales.count(key)
        if count > 0:
            i = index
            index += count
        yield i, self.languages[key][0]
class Spacer (object):
    def __init__(self, indent = None, initial = ''):
        """Prepare to manage indentation and line breaks.

        Arguments are both optional.

        First argument, indent, is either None (its default, for
        'minifying'), an integer (number of spaces) or the unit of
        text that is to be used for each indentation level (e.g. '\t'
        to use tabs). If indent is None, no indentation is added, nor
        are line-breaks; otherwise, self(text), for non-empty text,
        shall end with a newline and begin with indentation.

        Second argument, initial, is the initial indentation; it is
        ignored if indent is None. Indentation increases after each
        call to self(text) in which text starts with a tag and doesn't
        include its end-tag; indentation decreases if text starts with
        an end-tag. The text is not parsed any more carefully than
        just described.
        """
        if indent is None:
            self.__call = lambda x: x
        else:
            self.__each = ' ' * indent if isinstance(indent, int) else indent
            self.current = initial
            self.__call = self.__wrap

    def __wrap(self, line):
        # Returns line with the current indentation before it and a
        # newline after it, adjusting the indentation for later calls.
        if not line:
            return '\n'

        indent = self.current
        if line.startswith('</'):
            # With an empty indentation unit, slicing off its last
            # zero characters would discard the whole indentation:
            if self.__each:
                indent = indent[:-len(self.__each)]
            self.current = indent
        elif line.startswith('<') and not line.startswith('<!'):
            cut = line.find('>')
            tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0]
            if '</{}>'.format(tag) not in line:
                self.current += self.__each
        return indent + line + '\n'

    def __call__(self, line):
        return self.__call(line)

class QLocaleXmlWriter (object):
    def __init__(self, save = None, space = None):
        """Set up to write digested CLDR data as QLocale XML.

        Arguments are both optional.

        First argument, save, is None (its default) or a callable that
        will write content to where you intend to save it. If None, it
        is replaced with a callable that prints the given content,
        suppressing the newline (but see the following); this is
        equivalent to passing sys.stdout.write.

        Second argument, space, is an object to call on each text
        output to prepend indentation and append newlines, or not as
        the case may be. If None (its default), each writer gets a
        new Spacer(4) of its own, which grows indent by four spaces
        after each unmatched new tag and shrinks back on a close-tag
        (its parsing is naive, but adequate to how this class uses
        it), while adding a newline to each line.
        """
        self.__rawOutput = self.__printit if save is None else save
        # A Spacer carries state (its current indentation), so it
        # must not be shared between writers via a default argument:
        self.__wrap = Spacer(4) if space is None else space
        self.__write('<localeDatabase>')

    # Output of various sections, in their usual order:
    def enumData(self, languages, scripts, countries):
        """Write the language, script and country tables.

        Each argument is a mapping from ID to a sequence that starts
        with the name and the code."""
        self.__enumTable('languageList', languages)
        self.__enumTable('scriptList', scripts)
        self.__enumTable('countryList', countries)

    def likelySubTags(self, entries):
        """Write the likely sub-tags table.

        The argument is an iterable of pairs; each member of a pair
        is a sequence starting with language, script and country."""
        self.__openTag('likelySubtags')
        for have, give in entries:
            self.__openTag('likelySubtag')
            self.__likelySubTag('from', have)
            self.__likelySubTag('to', give)
            self.__closeTag('likelySubtag')
        self.__closeTag('likelySubtags')

    def locales(self, locales, calendars):
        """Write the C locale, then each of locales in sorted key order."""
        self.__openTag('localeList')
        self.__openTag('locale')
        Locale.C(calendars).toXml(self.inTag, calendars)
        self.__closeTag('locale')
        for key in sorted(locales):
            self.__openTag('locale')
            locales[key].toXml(self.inTag, calendars)
            self.__closeTag('locale')
        self.__closeTag('localeList')

    def version(self, cldrVersion):
        self.inTag('version', cldrVersion)

    def inTag(self, tag, text):
        """Write text enclosed in an element named tag; text is not escaped."""
        self.__write('<{0}>{1}</{0}>'.format(tag, text))

    def close(self):
        """Write the closing tag; any later attempt to write raises Error."""
        if self.__rawOutput != self.__complain:
            self.__write('</localeDatabase>')
        self.__rawOutput = self.__complain

    # Implementation details
    @staticmethod
    def __printit(text):
        print(text, end='')
    @staticmethod
    def __complain(text):
        raise Error('Attempted to write data after closing :-(')

    def __enumTable(self, tag, table):
        self.__openTag(tag)
        for key, value in table.items():
            # The tag ends in 'List'; each entry's tag is the rest of it:
            self.__openTag(tag[:-4])
            self.inTag('name', value[0])
            self.inTag('id', key)
            self.inTag('code', value[1])
            self.__closeTag(tag[:-4])
        self.__closeTag(tag)

    def __likelySubTag(self, tag, likely):
        self.__openTag(tag)
        self.inTag('language', likely[0])
        self.inTag('script', likely[1])
        self.inTag('country', likely[2])
        # self.inTag('variant', likely[3])
        self.__closeTag(tag)

    def __openTag(self, tag):
        self.__write('<{}>'.format(tag))
    def __closeTag(self, tag):
        self.__write('</{}>'.format(tag))

    def __write(self, line):
        self.__rawOutput(self.__wrap(line))
+ """ get = lambda k: getattr(self, k) for key in ('language', 'script', 'country'): - print inner + "<%s>" % key + get(key) + "</%s>" % key - print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key + write(key, get(key)) + write('{}code'.format(key), get('{}_code'.format(key))) - for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'): - print inner + "<%s>" % key + get(key) + "</%s>" % key + for key in ('decimal', 'group', 'zero', 'list', + 'percent', 'minus', 'plus', 'exp'): + write(key, get(key)) for key in ('languageEndonym', 'countryEndonym', 'quotationStart', 'quotationEnd', @@ -185,16 +513,10 @@ class Locale: '_'.join((k, cal)) for k in self.propsMonthDay('months') for cal in calendars): - print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key) + write(key, escape(get(key)).encode('utf-8')) for key in ('currencyDigits', 'currencyRounding'): - print inner + "<%s>%d</%s>" % (key, get(key), key) - - print indent + "</locale>" - - def __init__(self, data=None, **kw): - if data: self.__dict__.update(data) - if kw: self.__dict__.update(kw) + write(key, get(key)) # Tools used by __monthNames: def fullName(i, name): return name @@ -213,6 +535,9 @@ class Locale: @staticmethod def __monthNames(calendars, known={ # Map calendar to (names, extractors...): + # TODO: do we even need these ? CLDR's root.xml seems to + # have them, complete with yeartype="leap" handling for + # Hebrew's extra. 'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'), # Extractor pairs, (plain, standalone) @@ -240,8 +565,8 @@ class Locale: for cal in calendars: try: data = known[cal] - except KeyError: # Need to add an entry to known, above. - print 'Unsupported calendar:', cal + except KeyError as e: # Need to add an entry to known, above. 
def compareLocaleKeys(key1, key2):
    """Three-way comparison of (language, script, country) ID triples.

    Returns zero for equal keys, a negative value if key1 sorts
    first and a positive value if key2 does. Keys are ordered by
    language first. Within a language, a key whose country is the
    default for the script of the key being looked up sorts ahead
    of the other; failing that, keys are ordered by country if
    their scripts match, else by script.

    The defaults are read from the function's own default_map
    attribute, a mapping from (language, script) to country, which
    the caller must set before sorting."""
    if key1 == key2:
        return 0

    # Language is the primary sort key:
    if key1[0] != key2[0]:
        return key1[0] - key2[0]

    def defaultCountry(key, table = compareLocaleKeys.default_map):
        # A 1-tuple of the default country for key's language and
        # script; or an empty tuple if the table has no entry.
        try:
            return (table[key[:2]],)
        except KeyError:
            return ()

    # Defaults for the first key's script get first say:
    for country in defaultCountry(key1):
        if key1[2] == country:
            return -1
        if key2[2] == country:
            return 1

    sameScript = key1[1] == key2[1]
    if sameScript:
        return key1[2] - key2[2]

    # Scripts differ, so consult the second key's defaults as well:
    for country in defaultCountry(key2):
        if key2[2] == country:
            return 1
        if key1[2] == country:
            return -1

    return key1[1] - key2[1]
languageCount(language_id, locale_map): - result = 0 - for key in locale_map.keys(): - if key[0] == language_id: - result += 1 - return result - -def unicode2hex(s): - lst = [] - for x in s: - v = ord(x) - if v > 0xFFFF: - # make a surrogate pair - # copied from qchar.h - high = (v >> 10) + 0xd7c0 - low = (v % 0x400 + 0xdc00) - lst.append(hex(high)) - lst.append(hex(low)) - else: - lst.append(hex(v)) - return lst class StringDataToken: def __init__(self, index, length, bits): if index > 0xffff: - print "\n\n\n#error Data index is too big!", index - raise ValueError("Start-index (%d) exceeds the uint16 range!" % index) + raise ValueError('Start-index ({}) exceeds the uint16 range!'.format(index)) if length >= (1 << bits): - print "\n\n\n#error Range length is too big!", length - raise ValueError("Data size (%d) exceeds the %d-bit range!" % (length, bits)) + raise ValueError('Data size ({}) exceeds the {}-bit range!'.format(length, bits)) self.index = index self.length = length @@ -277,7 +92,7 @@ class StringData: self.name = name self.text = '' # Used in quick-search for matches in data - def append(self, s, bits=8): + def append(self, s, bits = 8): try: token = self.hash[s] except KeyError: @@ -317,592 +132,481 @@ class StringData: def write(self, fd): if len(self.data) > 0xffff: - raise ValueError("Data is too big for quint16 index to its end!" % len(self.data), + raise ValueError('Data is too big ({}) for quint16 index to its end!' 
def currencyIsoCodeData(s):
    """Render a currency code as a brace-enclosed list of character codes.

    Returns the code points of the characters of s, in decimal,
    comma-joined and enclosed in braces; or "{0,0,0}" if s is
    empty or otherwise false."""
    if not s:
        return "{0,0,0}"
    codes = [str(ord(char)) for char in s]
    return '{' + ",".join(codes) + '}'
+*/ -GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n" -GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n" +""".format(datetime.date.today(), version)) + +class LocaleDataWriter (LocaleSourceEditor): + def likelySubtags(self, likely): + self.writer.write('static const QLocaleId likely_subtags[] = {\n') + for had, have, got, give, last in likely: + self.writer.write(' {{ {:3d}, {:3d}, {:3d} }}'.format(*have)) + self.writer.write(', {{ {:3d}, {:3d}, {:3d} }}'.format(*give)) + self.writer.write(' ' if last else ',') + self.writer.write(' // {} -> {}\n'.format(had, got)) + self.writer.write('};\n\n') + + def localeIndex(self, indices): + self.writer.write('static const quint16 locale_index[] = {\n') + for pair in indices: + self.writer.write('{:6d}, // {}\n'.format(*pair)) + self.writer.write(' 0 // trailing 0\n') + self.writer.write('};\n\n') + + def localeData(self, locales, names): + list_pattern_part_data = StringData('list_pattern_part_data') + single_character_data = StringData('single_character_data') + date_format_data = StringData('date_format_data') + time_format_data = StringData('time_format_data') + days_data = StringData('days_data') + am_data = StringData('am_data') + pm_data = StringData('pm_data') + byte_unit_data = StringData('byte_unit_data') + currency_symbol_data = StringData('currency_symbol_data') + currency_display_name_data = StringData('currency_display_name_data') + currency_format_data = StringData('currency_format_data') + endonyms_data = StringData('endonyms_data') + + # Locale data + self.writer.write('static const QLocaleData locale_data[] = {\n') + # Table headings: keep each label centred in its field, matching line_format: + self.writer.write(' // ' + # Width 6 + comma + ' lang ' # IDs + 'script ' + ' terr ' + + # Range entries (all start-indices, then all sizes) + # Width 5 + comma + 'lStrt ' # List pattern + 'lpMid ' + 'lpEnd ' + 'lPair ' + 'lDelm ' # List delimiter + # Representing numbers + ' dec ' + 'group ' + 'prcnt ' 
+ ' zero ' + 'minus ' + 'plus ' + ' exp ' + # Quotation marks + 'qtOpn ' + 'qtEnd ' + 'altQO ' + 'altQE ' + 'lDFmt ' # Date format + 'sDFmt ' + 'lTFmt ' # Time format + 'sTFmt ' + 'slDay ' # Day names + 'lDays ' + 'ssDys ' + 'sDays ' + 'snDay ' + 'nDays ' + ' am ' # am/pm indicators + ' pm ' + ' byte ' + 'siQnt ' + 'iecQn ' + 'crSym ' # Currency formatting + 'crDsp ' + 'crFmt ' + 'crFNg ' + 'ntLng ' # Name of language in itself, and of territory + 'ntTer ' + # Width 3 + comma for each size; no header + + ' ' * 37 + + + # Strays (char array, bit-fields): + # Width 10 + 2 spaces + comma + ' currISO ' + # Width 6 + comma + 'curDgt ' # Currency digits + 'curRnd ' # Currencty rounding (unused: QTBUG-81343) + 'dow1st ' # First day of week + ' wknd+ ' # Week-end start/end days + ' wknd-' + # No trailing space on last entry (be sure to + # pad before adding anything after it). + '\n') + + formatLine = ''.join(( + ' {{ ', + # Locale-identifier + '{:6d},' * 3, + # List patterns, date/time formats, day names, am/pm + # SI/IEC byte-unit abbreviations + # Currency and endonyms + # Range starts + '{:5d},' * 37, + # Range sizes + '{:3d},' * 37, + + # Currency ISO code + ' {:>10s}, ', + # Currency formatting + '{:6d},{:6d}', + # Day of week and week-end + ',{:6d}' * 3, + ' }}')).format + for key in names: + locale = locales[key] + # Sequence of StringDataToken: + ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries: + (locale.listPatternPartStart, locale.listPatternPartMiddle, + locale.listPatternPartEnd, locale.listPatternPartTwo, + locale.listDelim)) + + tuple(single_character_data.append(p) for p in # 11 entries + (locale.decimal, locale.group, locale.percent, locale.zero, + locale.minus, locale.plus, locale.exp, + locale.quotationStart, locale.quotationEnd, + locale.alternateQuotationStart, locale.alternateQuotationEnd)) + + tuple (date_format_data.append(f) for f in # 2 entries: + (locale.longDateFormat, locale.shortDateFormat)) + + 
tuple(time_format_data.append(f) for f in # 2 entries: + (locale.longTimeFormat, locale.shortTimeFormat)) + + tuple(days_data.append(d) for d in # 6 entries: + (locale.standaloneLongDays, locale.longDays, + locale.standaloneShortDays, locale.shortDays, + locale.standaloneNarrowDays, locale.narrowDays)) + + (am_data.append(locale.am), pm_data.append(locale.pm)) + # 2 entries + tuple(byte_unit_data.append(b) for b in # 3 entries: + (locale.byte_unit, + locale.byte_si_quantified, + locale.byte_iec_quantified)) + + (currency_symbol_data.append(locale.currencySymbol), + currency_display_name_data.append(locale.currencyDisplayName), + currency_format_data.append(locale.currencyFormat), + currency_format_data.append(locale.currencyNegativeFormat), + endonyms_data.append(locale.languageEndonym), + endonyms_data.append(locale.countryEndonym)) # 6 entries + ) # Total: 37 entries + assert len(ranges) == 37 + + self.writer.write(formatLine(*( + key + + tuple(r.index for r in ranges) + + tuple(r.length for r in ranges) + + (currencyIsoCodeData(locale.currencyIsoCode), + locale.currencyDigits, + locale.currencyRounding, # unused (QTBUG-81343) + locale.firstDayOfWeek, + locale.weekendStart, + locale.weekendEnd) )) + + ', // {}/{}/{}\n'.format( + locale.language, locale.script, locale.country)) + self.writer.write(formatLine(*( # All zeros, matching the format: + (0,) * 3 + (0,) * 37 * 2 + + (currencyIsoCodeData(0),) + + (0,) * 2 + + (0,) * 3 )) + + ' // trailing zeros\n') + self.writer.write('};\n') + + # StringData tables: + for data in (list_pattern_part_data, single_character_data, + date_format_data, time_format_data, days_data, + byte_unit_data, am_data, pm_data, currency_symbol_data, + currency_display_name_data, currency_format_data, + endonyms_data): + data.write(self.writer) + + @staticmethod + def __writeNameData(out, book, form): + out('static const char {}_name_list[] =\n'.format(form)) + out('"Default\\0"\n') + for key, value in book.items(): + if key == 0: + 
continue + out('"' + value[0] + '\\0"\n') + out(';\n\n') + + out('static const quint16 {}_name_index[] = {{\n'.format(form)) + out(' 0, // Any{}\n'.format(form.capitalize())) + index = 8 + for key, value in book.items(): + if key == 0: + continue + name = value[0] + out('{:6d}, // {}\n'.format(index, name)) + index += len(name) + 1 + out('};\n\n') + + @staticmethod + def __writeCodeList(out, book, form, width): + out('static const unsigned char {}_code_list[] =\n'.format(form)) + for key, value in book.items(): + code = value[1] + code += r'\0' * max(width - len(code), 0) + out('"{}" // {}\n'.format(code, value[0])) + out(';\n\n') + + def languageNames(self, languages): + self.__writeNameData(self.writer.write, languages, 'language') + + def scriptNames(self, scripts): + self.__writeNameData(self.writer.write, scripts, 'script') + + def countryNames(self, countries): + self.__writeNameData(self.writer.write, countries, 'country') + + # TODO: unify these next three into the previous three; kept + # separate for now to verify we're not changing data. 
+ + def languageCodes(self, languages): + self.__writeCodeList(self.writer.write, languages, 'language', 3) + + def scriptCodes(self, scripts): + self.__writeCodeList(self.writer.write, scripts, 'script', 4) + + def countryCodes(self, countries): # TODO: unify with countryNames() + self.__writeCodeList(self.writer.write, countries, 'country', 3) + +class CalendarDataWriter (LocaleSourceEditor): + formatCalendar = ( + ' {{' + + ','.join(('{:6d}',) * 3 + ('{:5d}',) * 6 + ('{:3d}',) * 6) + + ' }},').format + def write(self, calendar, locales, names): + months_data = StringData('months_data') -def main(): - if len(sys.argv) != 3: - usage() + self.writer.write('static const QCalendarLocale locale_data[] = {\n') + self.writer.write( + ' //' + # IDs, width 7 (6 + comma) + ' lang ' + ' script' + ' terr ' + # Month-name start-indices, width 6 (5 + comma) + 'sLong ' + ' long ' + 'sShrt ' + 'short ' + 'sNarw ' + 'narow ' + # No individual headers for the sizes. + 'Sizes...' + '\n') + for key in names: + locale = locales[key] + # Sequence of StringDataToken: + try: + # Twelve long month names can add up to more than 256 (e.g. 
kde_TZ: 264) + ranges = (tuple(months_data.append(m[calendar], 16) for m in + (locale.standaloneLongMonths, locale.longMonths)) + + tuple(months_data.append(m[calendar]) for m in + (locale.standaloneShortMonths, locale.shortMonths, + locale.standaloneNarrowMonths, locale.narrowMonths))) + except ValueError as e: + e.args += (locale.language, locale.script, locale.country, stem) + raise - qlocalexml = sys.argv[1] - qtsrcdir = sys.argv[2] + self.writer.write( + self.formatCalendar(*( + key + + tuple(r.index for r in ranges) + + tuple(r.length for r in ranges) )) + + '// {}/{}/{}\n'.format(locale.language, locale.script, locale.country)) + self.writer.write(self.formatCalendar(*( (0,) * (3 + 6 * 2) )) + + '// trailing zeros\n') + self.writer.write('};\n') + months_data.write(self.writer) + +class LocaleHeaderWriter (SourceFileEditor): + __upinit = SourceFileEditor.__init__ + def __init__(self, path, temp, dupes): + self.__upinit(path, temp) + self.__dupes = dupes + + def languages(self, languages): + self.__enum('Language', languages, self.__language) + self.writer.write('\n') + + def countries(self, countries): + self.__enum('Country', countries, self.__country) + + def scripts(self, scripts): + self.__enum('Script', scripts, self.__script) + self.writer.write('\n') + + # Implementation details + from enumdata import (language_aliases as __language, + country_aliases as __country, + script_aliases as __script) + + def __enum(self, name, book, alias): + assert book + out, dupes = self.writer.write, self.__dupes + out(' enum {} {{\n'.format(name)) + for key, value in book.items(): + member = value[0] + if name == 'Script': + # Don't .capitalize() as some names are already camel-case (see enumdata.py): + member = ''.join(word[0].upper() + word[1:] for word in member.split()) + if not member.endswith('Script'): + member += 'Script' + if member in dupes: + raise Error('The script name "{}" is messy'.format(member)) + else: + member = ''.join(member.split()) + member = 
member + name if member in dupes else member + out(' {} = {},\n'.format(member, key)) + + out('\n ' + + ',\n '.join('{} = {}'.format(*pair) + for pair in sorted(alias.items())) + + ',\n\n Last{} = {}\n }};\n'.format(name, member)) + +def usage(name, err, message = ''): + err.write("""Usage: {} path/to/qlocale.xml root/of/qtbase +""".format(name)) # TODO: elaborate + if message: + err.write('\n' + message + '\n') + +def main(args, out, err): + # TODO: Make calendars a command-line parameter + # map { CLDR name: Qt file name } + calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew', + + name = args.pop(0) + if len(args) != 2: + usage(name, err, 'I expect two arguments') + return 1 + + qlocalexml = args.pop(0) + qtsrcdir = args.pop(0) if not (os.path.isdir(qtsrcdir) and all(os.path.isfile(os.path.join(qtsrcdir, 'src', 'corelib', 'text', leaf)) for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))): - usage() - - (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p.h", dir=qtsrcdir) - data_temp_file = os.fdopen(data_temp_file, "w") - qlocaledata_file = open(qtsrcdir + "/src/corelib/text/qlocale_data_p.h", "r") - s = qlocaledata_file.readline() - while s and s != GENERATED_BLOCK_START: - data_temp_file.write(s) - s = qlocaledata_file.readline() - data_temp_file.write(GENERATED_BLOCK_START) - - doc = xml.dom.minidom.parse(qlocalexml) - language_map = loadMap(doc, 'language') - script_map = loadMap(doc, 'script') - country_map = loadMap(doc, 'country') - likely_subtags_map = loadLikelySubtagsMap(doc) - default_map = {} - for key in likely_subtags_map.keys(): - tmp = likely_subtags_map[key] - if tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry" and tmp["to"][2] != "AnyCountry": - default_map[(tmp["to"][0], tmp["to"][1])] = tmp["to"][2] - locale_map = loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map) - dupes = findDupes(language_map, country_map) - - 
cldr_version = firstChildText(doc.documentElement, "version") - data_temp_file.write(generated_template % (datetime.date.today(), cldr_version)) - - # Likely subtags map - data_temp_file.write("static const QLocaleId likely_subtags[] = {\n") - index = 0 - for key in likely_subtags_map.keys(): - tmp = likely_subtags_map[key] - from_language = languageNameToId(tmp["from"][0], language_map) - from_script = scriptNameToId(tmp["from"][1], script_map) - from_country = countryNameToId(tmp["from"][2], country_map) - to_language = languageNameToId(tmp["to"][0], language_map) - to_script = scriptNameToId(tmp["to"][1], script_map) - to_country = countryNameToId(tmp["to"][2], country_map) - - cmnt_from = "" - if from_language != 0: - cmnt_from = cmnt_from + language_map[from_language][1] - else: - cmnt_from = cmnt_from + "und" - if from_script != 0: - if cmnt_from: - cmnt_from = cmnt_from + "_" - cmnt_from = cmnt_from + script_map[from_script][1] - if from_country != 0: - if cmnt_from: - cmnt_from = cmnt_from + "_" - cmnt_from = cmnt_from + country_map[from_country][1] - cmnt_to = "" - if to_language != 0: - cmnt_to = cmnt_to + language_map[to_language][1] - else: - cmnt_to = cmnt_to + "und" - if to_script != 0: - if cmnt_to: - cmnt_to = cmnt_to + "_" - cmnt_to = cmnt_to + script_map[to_script][1] - if to_country != 0: - if cmnt_to: - cmnt_to = cmnt_to + "_" - cmnt_to = cmnt_to + country_map[to_country][1] - - data_temp_file.write(" ") - data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" % - (from_language, from_script, from_country, to_language, to_script, to_country)) - index += 1 - if index != len(likely_subtags_map): - data_temp_file.write(",") - else: - data_temp_file.write(" ") - data_temp_file.write(" // %s -> %s\n" % (cmnt_from, cmnt_to)) - data_temp_file.write("};\n") - - data_temp_file.write("\n") - - # Locale index - data_temp_file.write("static const quint16 locale_index[] = {\n") - index = 0 - for key in language_map.keys(): - i = 0 - count = 
languageCount(key, locale_map) - if count > 0: - i = index - index += count - data_temp_file.write("%6d, // %s\n" % (i, language_map[key][0])) - data_temp_file.write(" 0 // trailing 0\n") - data_temp_file.write("};\n\n") - - list_pattern_part_data = StringData('list_pattern_part_data') - single_character_data = StringData('single_character_data') - date_format_data = StringData('date_format_data') - time_format_data = StringData('time_format_data') - days_data = StringData('days_data') - am_data = StringData('am_data') - pm_data = StringData('pm_data') - byte_unit_data = StringData('byte_unit_data') - currency_symbol_data = StringData('currency_symbol_data') - currency_display_name_data = StringData('currency_display_name_data') - currency_format_data = StringData('currency_format_data') - endonyms_data = StringData('endonyms_data') - - # Locale data - data_temp_file.write("static const QLocaleData locale_data[] = {\n") - # Table headings: keep each label centred in its field, matching line_format: - data_temp_file.write(' // ' - # Width 6 + comma: - + ' lang ' # IDs - + 'script ' - + ' terr ' - - # Range entries (all start-indices, then all sizes): - # Width 5 + comma: - + 'lStrt ' # List pattern - + 'lpMid ' - + 'lpEnd ' - + 'lPair ' - + 'lDelm ' # List delimiter - # Representing numbers: - + ' dec ' - + 'group ' - + 'prcnt ' - + ' zero ' - + 'minus ' - + 'plus ' - + ' exp ' - # Quotation marks - + 'qtOpn ' - + 'qtEnd ' - + 'altQO ' - + 'altQE ' - + 'lDFmt ' # Date format - + 'sDFmt ' - + 'lTFmt ' # Time format - + 'sTFmt ' - + 'slDay ' # Day names - + 'lDays ' - + 'ssDys ' - + 'sDays ' - + 'snDay ' - + 'nDays ' - + ' am ' # am/pm indicators - + ' pm ' - + ' byte ' - + 'siQnt ' - + 'iecQn ' - + 'crSym ' # Currency formatting: - + 'crDsp ' - + 'crFmt ' - + 'crFNg ' - + 'ntLng ' # Name of language in itself, and of territory: - + 'ntTer ' - # Width 3 + comma for each size; no header - + ' ' * 37 - - # Strays (char array, bit-fields): - # Width 8+4 + comma - + ' 
currISO ' - # Width 6 + comma: - + 'curDgt ' # Currency digits - + 'curRnd ' # Currencty rounding (unused: QTBUG-81343) - + 'dow1st ' # First day of week - + ' wknd+ ' # Week-end start/end days: - + ' wknd-' - # No trailing space on last entry (be sure to - # pad before adding anything after it). - + '\n') + usage(name, err, 'Missing expected files under qtbase source root ' + qtsrcdir) + return 1 + + reader = QLocaleXmlReader(qlocalexml) + locale_map = dict(reader.loadLocaleMap(calendars, err.write)) locale_keys = locale_map.keys() - compareLocaleKeys.default_map = default_map - compareLocaleKeys.locale_map = locale_map + compareLocaleKeys.default_map = dict(reader.defaultMap()) locale_keys.sort(compareLocaleKeys) - line_format = (' { ' - # Locale-identifier: - + '%6d,' * 3 - # Offsets for starts of ranges: - + '%5d,' * 37 - # Sizes for the same: - + '%3d,' * 37 - - # Currency ISO code: - + ' %10s, ' - # Currency formatting: - + '%6d,%6d' - # Day of week and week-end: - + ',%6d' * 3 - + ' }') - for key in locale_keys: - l = locale_map[key] - # Sequence of StringDataToken: - ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries: - (l.listPatternPartStart, l.listPatternPartMiddle, - l.listPatternPartEnd, l.listPatternPartTwo, l.listDelim)) + - tuple(single_character_data.append(p) for p in # 11 entries - (l.decimal, l.group, l.percent, l.zero, l.minus, l.plus, l.exp, - l.quotationStart, l.quotationEnd, - l.alternateQuotationStart, l.alternateQuotationEnd)) + - tuple (date_format_data.append(f) for f in # 2 entries: - (l.longDateFormat, l.shortDateFormat)) + - tuple(time_format_data.append(f) for f in # 2 entries: - (l.longTimeFormat, l.shortTimeFormat)) + - tuple(days_data.append(d) for d in # 6 entries: - (l.standaloneLongDays, l.longDays, - l.standaloneShortDays, l.shortDays, - l.standaloneNarrowDays, l.narrowDays)) + - (am_data.append(l.am), pm_data.append(l.pm)) + # 2 entries: - tuple(byte_unit_data.append(b) for b in # 3 entries: - (l.byte_unit, 
l.byte_si_quantified, l.byte_iec_quantified)) + - (currency_symbol_data.append(l.currencySymbol), - currency_display_name_data.append(l.currencyDisplayName), - currency_format_data.append(l.currencyFormat), - currency_format_data.append(l.currencyNegativeFormat), - endonyms_data.append(l.languageEndonym), - endonyms_data.append(l.countryEndonym)) # 6 entries - ) # Total: 37 entries - assert len(ranges) == 37 - - data_temp_file.write(line_format - % ((key[0], key[1], key[2]) + - tuple(r.index for r in ranges) + - tuple(r.length for r in ranges) + - (currencyIsoCodeData(l.currencyIsoCode), - l.currencyDigits, - l.currencyRounding, # unused (QTBUG-81343) - l.firstDayOfWeek, - l.weekendStart, - l.weekendEnd)) - + ", // %s/%s/%s\n" % (l.language, l.script, l.country)) - data_temp_file.write(line_format # All zeros, matching the format: - % ( (0,) * 3 + (0,) * 37 * 2 - + (currencyIsoCodeData(0),) - + (0,) * 2 - + (0,) * 3) - + " // trailing zeros\n") - data_temp_file.write("};\n") - - # StringData tables: - for data in (list_pattern_part_data, single_character_data, - date_format_data, time_format_data, days_data, - byte_unit_data, am_data, pm_data, currency_symbol_data, - currency_display_name_data, currency_format_data, - endonyms_data): - data.write(data_temp_file) - - data_temp_file.write("\n") - - # Language name list - data_temp_file.write("static const char language_name_list[] =\n") - data_temp_file.write('"Default\\0"\n') - for key in language_map.keys(): - if key == 0: - continue - data_temp_file.write('"' + language_map[key][0] + '\\0"\n') - data_temp_file.write(";\n") - - data_temp_file.write("\n") - - # Language name index - data_temp_file.write("static const quint16 language_name_index[] = {\n") - data_temp_file.write(" 0, // AnyLanguage\n") - index = 8 - for key in language_map.keys(): - if key == 0: - continue - language = language_map[key][0] - data_temp_file.write("%6d, // %s\n" % (index, language)) - index += len(language) + 1 - 
data_temp_file.write("};\n") - - data_temp_file.write("\n") - - # Script name list - data_temp_file.write("static const char script_name_list[] =\n") - data_temp_file.write('"Default\\0"\n') - for key in script_map.keys(): - if key == 0: - continue - data_temp_file.write('"' + script_map[key][0] + '\\0"\n') - data_temp_file.write(";\n") - - data_temp_file.write("\n") - - # Script name index - data_temp_file.write("static const quint16 script_name_index[] = {\n") - data_temp_file.write(" 0, // AnyScript\n") - index = 8 - for key in script_map.keys(): - if key == 0: - continue - script = script_map[key][0] - data_temp_file.write("%6d, // %s\n" % (index, script)) - index += len(script) + 1 - data_temp_file.write("};\n") - - data_temp_file.write("\n") - - # Country name list - data_temp_file.write("static const char country_name_list[] =\n") - data_temp_file.write('"Default\\0"\n') - for key in country_map.keys(): - if key == 0: - continue - data_temp_file.write('"' + country_map[key][0] + '\\0"\n') - data_temp_file.write(";\n") - - data_temp_file.write("\n") - - # Country name index - data_temp_file.write("static const quint16 country_name_index[] = {\n") - data_temp_file.write(" 0, // AnyCountry\n") - index = 8 - for key in country_map.keys(): - if key == 0: - continue - country = country_map[key][0] - data_temp_file.write("%6d, // %s\n" % (index, country)) - index += len(country) + 1 - data_temp_file.write("};\n") - - data_temp_file.write("\n") - - # Language code list - data_temp_file.write("static const unsigned char language_code_list[] =\n") - for key in language_map.keys(): - code = language_map[key][1] - if len(code) == 2: - code += r"\0" - data_temp_file.write('"%2s" // %s\n' % (code, language_map[key][0])) - data_temp_file.write(";\n") - - data_temp_file.write("\n") - - # Script code list - data_temp_file.write("static const unsigned char script_code_list[] =\n") - for key in script_map.keys(): - code = script_map[key][1] - for i in range(4 - len(code)): - 
code += "\\0" - data_temp_file.write('"%2s" // %s\n' % (code, script_map[key][0])) - data_temp_file.write(";\n") - - # Country code list - data_temp_file.write("static const unsigned char country_code_list[] =\n") - for key in country_map.keys(): - code = country_map[key][1] - if len(code) == 2: - code += "\\0" - data_temp_file.write('"%2s" // %s\n' % (code, country_map[key][0])) - data_temp_file.write(";\n") - - data_temp_file.write("\n") - data_temp_file.write(GENERATED_BLOCK_END) - s = qlocaledata_file.readline() - # skip until end of the old block - while s and s != GENERATED_BLOCK_END: - s = qlocaledata_file.readline() - - s = qlocaledata_file.readline() - while s: - data_temp_file.write(s) - s = qlocaledata_file.readline() - data_temp_file.close() - qlocaledata_file.close() - - os.remove(qtsrcdir + "/src/corelib/text/qlocale_data_p.h") - os.rename(data_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale_data_p.h") + try: + writer = LocaleDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text', + 'qlocale_data_p.h'), + qtsrcdir, reader.cldrVersion) + except IOError as e: + err.write('Failed to open files to transcribe locale data: ' + (e.message or e.args[1])) + return 1 + + try: + writer.likelySubtags(reader.likelyMap()) + writer.localeIndex(reader.languageIndices(tuple(k[0] for k in locale_map))) + writer.localeData(locale_map, locale_keys) + writer.writer.write('\n') + writer.languageNames(reader.languages) + writer.scriptNames(reader.scripts) + writer.countryNames(reader.countries) + # TODO: merge the next three into the previous three + writer.languageCodes(reader.languages) + writer.scriptCodes(reader.scripts) + writer.countryCodes(reader.countries) + except Error as e: + writer.cleanup() + err.write('\nError updating locale data: ' + e.message + '\n') + return 1 + + writer.close() # Generate calendar data - calendar_format = ' {%6d,%6d,%6d' + ',%5d' * 6 + ',%3d' * 6 + ' },' for calendar, stem in calendars.items(): - months_data = 
StringData('months_data') - calendar_data_file = "q%scalendar_data_p.h" % stem - calendar_template_file = open(os.path.join(qtsrcdir, 'src', 'corelib', 'time', - calendar_data_file), "r") - (calendar_temp_file, calendar_temp_file_path) = tempfile.mkstemp(calendar_data_file, dir=qtsrcdir) - calendar_temp_file = os.fdopen(calendar_temp_file, "w") - s = calendar_template_file.readline() - while s and s != GENERATED_BLOCK_START: - calendar_temp_file.write(s) - s = calendar_template_file.readline() - calendar_temp_file.write(GENERATED_BLOCK_START) - calendar_temp_file.write(generated_template % (datetime.date.today(), cldr_version)) - calendar_temp_file.write("static const QCalendarLocale locale_data[] = {\n") - calendar_temp_file.write(' // ' - # IDs, width 7 (6 + comma) - + ' lang ' - + ' script' - + ' terr ' - # Month-name start-indices, width 6 (5 + comma): - + 'sLng ' - + 'long ' - + 'sSrt ' - + 'shrt ' - + 'sNrw ' - + 'naro ' - # No individual headers for the sizes. - + 'Sizes...' - + '\n') - for key in locale_keys: - l = locale_map[key] - # Sequence of StringDataToken: - try: - # Twelve long month names can add up to more than 256 (e.g. 
kde_TZ: 264) - ranges = (tuple(months_data.append(m[calendar], 16) for m in - (l.standaloneLongMonths, l.longMonths)) + - tuple(months_data.append(m[calendar]) for m in - (l.standaloneShortMonths, l.shortMonths, - l.standaloneNarrowMonths, l.narrowMonths))) - except ValueError as e: - e.args += (l.language, l.script, l.country, stem) - raise + try: + writer = CalendarDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'time', + 'q{}calendar_data_p.h'.format(stem)), + qtsrcdir, reader.cldrVersion) + except IOError as e: + err.write('Failed to open files to transcribe ' + calendar + + ' data ' + (e.message or e.args[1])) + return 1 + + try: + writer.write(calendar, locale_map, locale_keys) + except Error as e: + writer.cleanup() + err.write('\nError updating ' + calendar + ' locale data: ' + e.message + '\n') + return 1 - calendar_temp_file.write( - calendar_format - % ((key[0], key[1], key[2]) + - tuple(r.index for r in ranges) + - tuple(r.length for r in ranges)) - + "// %s/%s/%s\n" % (l.language, l.script, l.country)) - calendar_temp_file.write(calendar_format % ( (0,) * (3 + 6 * 2) ) - + '// trailing zeros\n') - calendar_temp_file.write("};\n") - months_data.write(calendar_temp_file) - s = calendar_template_file.readline() - while s and s != GENERATED_BLOCK_END: - s = calendar_template_file.readline() - while s: - calendar_temp_file.write(s) - s = calendar_template_file.readline() - os.rename(calendar_temp_file_path, - os.path.join(qtsrcdir, 'src', 'corelib', 'time', calendar_data_file)) + writer.close() # qlocale.h + try: + writer = LocaleHeaderWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.h'), + qtsrcdir, reader.dupes) + except IOError as e: + err.write('Failed to open files to transcribe qlocale.h: ' + (e.message or e.args[1])) + return 1 - (qlocaleh_temp_file, qlocaleh_temp_file_path) = tempfile.mkstemp("qlocale.h", dir=qtsrcdir) - qlocaleh_temp_file = os.fdopen(qlocaleh_temp_file, "w") - qlocaleh_file = open(qtsrcdir + 
"/src/corelib/text/qlocale.h", "r") - s = qlocaleh_file.readline() - while s and s != GENERATED_BLOCK_START: - qlocaleh_temp_file.write(s) - s = qlocaleh_file.readline() - qlocaleh_temp_file.write(GENERATED_BLOCK_START) - qlocaleh_temp_file.write("// see qlocale_data_p.h for more info on generated data\n") - - # Language enum - qlocaleh_temp_file.write(" enum Language {\n") - language = None - for key, value in language_map.items(): - language = fixedLanguageName(value[0], dupes) - qlocaleh_temp_file.write(" " + language + " = " + str(key) + ",\n") - - qlocaleh_temp_file.write("\n " + - ",\n ".join('%s = %s' % pair - for pair in sorted(language_aliases.items())) + - ",\n") - qlocaleh_temp_file.write("\n") - qlocaleh_temp_file.write(" LastLanguage = " + language + "\n") - qlocaleh_temp_file.write(" };\n\n") - - # Script enum - qlocaleh_temp_file.write(" enum Script {\n") - script = None - for key, value in script_map.items(): - script = fixedScriptName(value[0], dupes) - qlocaleh_temp_file.write(" " + script + " = " + str(key) + ",\n") - qlocaleh_temp_file.write("\n " + - ",\n ".join('%s = %s' % pair - for pair in sorted(script_aliases.items())) + - ",\n") - qlocaleh_temp_file.write("\n") - qlocaleh_temp_file.write(" LastScript = " + script + "\n") - qlocaleh_temp_file.write(" };\n\n") - - # Country enum - qlocaleh_temp_file.write(" enum Country {\n") - country = None - for key, value in country_map.items(): - country = fixedCountryName(value[0], dupes) - qlocaleh_temp_file.write(" " + country + " = " + str(key) + ",\n") - qlocaleh_temp_file.write("\n " + - ",\n ".join('%s = %s' % pair - for pair in sorted(country_aliases.items())) + - ",\n") - qlocaleh_temp_file.write("\n") - qlocaleh_temp_file.write(" LastCountry = " + country + "\n") - qlocaleh_temp_file.write(" };\n") - - qlocaleh_temp_file.write(GENERATED_BLOCK_END) - s = qlocaleh_file.readline() - # skip until end of the old block - while s and s != GENERATED_BLOCK_END: - s = qlocaleh_file.readline() - - s = 
qlocaleh_file.readline() - while s: - qlocaleh_temp_file.write(s) - s = qlocaleh_file.readline() - qlocaleh_temp_file.close() - qlocaleh_file.close() - - os.remove(qtsrcdir + "/src/corelib/text/qlocale.h") - os.rename(qlocaleh_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.h") + try: + writer.languages(reader.languages) + writer.scripts(reader.scripts) + writer.countries(reader.countries) + except Error as e: + writer.cleanup() + err.write('\nError updating qlocale.h: ' + e.message + '\n') + return 1 + + writer.close() # qlocale.qdoc + try: + writer = Transcriber(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.qdoc'), + qtsrcdir) + except IOError as e: + err.write('Failed to open files to transcribe qlocale.qdoc: ' + (e.message or e.args[1])) + return 1 - (qlocaleqdoc_temp_file, qlocaleqdoc_temp_file_path) = tempfile.mkstemp("qlocale.qdoc", dir=qtsrcdir) - qlocaleqdoc_temp_file = os.fdopen(qlocaleqdoc_temp_file, "w") - qlocaleqdoc_file = open(qtsrcdir + "/src/corelib/text/qlocale.qdoc", "r") - s = qlocaleqdoc_file.readline() DOCSTRING = " QLocale's data is based on Common Locale Data Repository " - while s: - if DOCSTRING in s: - qlocaleqdoc_temp_file.write(DOCSTRING + "v" + cldr_version + ".\n") - else: - qlocaleqdoc_temp_file.write(s) - s = qlocaleqdoc_file.readline() - qlocaleqdoc_temp_file.close() - qlocaleqdoc_file.close() - - os.remove(qtsrcdir + "/src/corelib/text/qlocale.qdoc") - os.rename(qlocaleqdoc_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.qdoc") + try: + for line in writer.reader: + if DOCSTRING in line: + writer.writer.write(DOCSTRING + 'v' + reader.cldrVersion + '.\n') + else: + writer.writer.write(line) + except Error as e: + writer.cleanup() + err.write('\nError updating qlocale.qdoc: ' + e.message + '\n') + return 1 + + writer.close() + return 0 if __name__ == "__main__": - main() + import sys + sys.exit(main(sys.argv, sys.stdout, sys.stderr)) diff --git a/util/locale_database/xpathlite.py 
b/util/locale_database/xpathlite.py deleted file mode 100644 index 97efaaab41..0000000000 --- a/util/locale_database/xpathlite.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python -############################################################################# -## -## Copyright (C) 2016 The Qt Company Ltd. -## Contact: https://www.qt.io/licensing/ -## -## This file is part of the test suite of the Qt Toolkit. -## -## $QT_BEGIN_LICENSE:GPL-EXCEPT$ -## Commercial License Usage -## Licensees holding valid commercial Qt licenses may use this file in -## accordance with the commercial license agreement provided with the -## Software or, alternatively, in accordance with the terms contained in -## a written agreement between you and The Qt Company. For licensing terms -## and conditions see https://www.qt.io/terms-conditions. For further -## information use the contact form at https://www.qt.io/contact-us. -## -## GNU General Public License Usage -## Alternatively, this file may be used under the terms of the GNU -## General Public License version 3 as published by the Free Software -## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -## included in the packaging of this file. Please review the following -## information to ensure the GNU General Public License requirements will -## be met: https://www.gnu.org/licenses/gpl-3.0.html. 
##
## $QT_END_LICENSE$
##
#############################################################################

import sys
import os
import xml.dom.minidom

class DraftResolution:
    """Ranks a CLDR draft status so that two statuses can be compared.

    See http://www.unicode.org/cldr/process.html for description.
    """
    unconfirmed = 'unconfirmed'
    provisional = 'provisional'
    contributed = 'contributed'
    approved = 'approved'
    # Higher number means a more trusted status.
    _values = { unconfirmed : 1, provisional : 2, contributed : 3, approved : 4 }

    def __init__(self, resolution):
        self.resolution = resolution

    def toInt(self):
        """Return the numeric rank; raises KeyError for an unknown status."""
        return DraftResolution._values[self.resolution]

class Error(Exception):
    """Fatal lookup failure; str() of the instance is the message.

    Derives from Exception so that it can be raised and caught on
    Python 3 as well as on Python 2.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg

# Parsed documents, keyed by the file name passed to parseDoc().
doc_cache = {}
def parseDoc(file):
    """Return the minidom document for file, parsing it at most once."""
    if file not in doc_cache:
        doc_cache[file] = xml.dom.minidom.parse(file)
    return doc_cache[file]

def findChild(parent, tag_name, arg_name=None, arg_value=None, draft=None):
    """Find the first child element of parent that matches.

    A child matches when its node name is tag_name and, if arg_value
    is non-empty, its attribute arg_name has exactly that value. If
    draft is given, children whose draft attribute ranks below it are
    skipped; a child with no draft attribute counts as approved.

    Returns the matching node, or False when there is none.
    """
    for node in parent.childNodes:
        if node.nodeType != node.ELEMENT_NODE or node.nodeName != tag_name:
            continue
        if arg_value:
            if not node.hasAttribute(arg_name):
                continue
            if node.getAttribute(arg_name) != arg_value:
                continue
        if draft and node.hasAttribute('draft'):
            # A missing draft attribute means approved, so only an
            # explicit one can disqualify the node.
            value = DraftResolution(node.getAttribute('draft')).toInt()
            exemplar = DraftResolution(draft).toInt()
            if exemplar > value:
                continue
        return node
    return False

def codeMapsFromFile(file):
    """Extract mappings of language, script and country codes to names.

    The file shall typically be common/main/en.xml, which contains a
    localeDisplayNames element with children languages, scripts and
    territories; each element in each of these has a code as its type
    attribute and its name as element content. This returns a mapping
    with keys 'language', 'script' and 'country', each of which
    has, as value, a mapping of the relevant codes to names.
    """
    parent = findChild(findChild(parseDoc(file), 'ldml'), 'localeDisplayNames')
    keys, result = {'languages': 'language', 'scripts': 'script', 'territories': 'country'}, {}
    for src, dst in keys.items():
        child = findChild(parent, src)
        data = result[dst] = {}
        for elt in child.childNodes:
            # Only elements carry attributes; skip text and comment nodes.
            if elt.nodeType != elt.ELEMENT_NODE or not elt.hasAttribute('type'):
                continue
            key, value = elt.getAttribute('type'), elt.childNodes[0].wholeText
            # Don't over-write previously-read data for an alt form:
            if elt.hasAttribute('alt') and key in data:
                continue
            data[key] = value

    return result

def _splitTagSpec(tag_spec):
    """Split one path step into (tag_name, arg_name, arg_value).

    A step is either 'tag', 'tag[value]' or 'tag[name=value]'; the
    attribute name defaults to 'type' and the value to ''.
    """
    left_bracket = tag_spec.find('[')
    if left_bracket == -1:
        return tag_spec, 'type', ''
    tag_name = tag_spec[:left_bracket]
    parts = tag_spec[left_bracket+1:-1].split("=")
    if len(parts) == 2:
        return tag_name, parts[0], parts[1]
    return tag_name, 'type', parts[0]

def findTagsInFile(file, path):
    """List the attributes found at path in file.

    Returns None if path cannot be followed. Otherwise returns a list
    of [nodeName, attribute (name, value) pairs] entries: one for each
    child of the element at path that has attributes or, if that
    element has no children, one for the element itself (if it has
    attributes).
    """
    elt = parseDoc(file).documentElement
    for tag_spec in path.split("/"):
        tag_name, arg_name, arg_value = _splitTagSpec(tag_spec)
        elt = findChild(elt, tag_name, arg_name, arg_value)
        if not elt:
            return None

    ret = []
    if elt.childNodes:
        for node in elt.childNodes:
            if node.attributes:
                ret.append([node.nodeName, node.attributes.items()])
    elif elt.attributes:
        ret.append([elt.nodeName, elt.attributes.items()])
    return ret

def _findEntryInFile(file, path, draft=None, attribute=None):
    """Look path up in a single file.

    Returns a pair (value, aliaspath):
      ("", None) if some step of path has no matching element;
      (None, aliaspath) if an alias with source="locale" was met on
        the way, where aliaspath is the rewritten path from which the
        caller must restart the lookup;
      (value, None) with the named attribute's value, or the value of
        the element's first child node when attribute is None;
      (None, None) if the element lacks that attribute or child.
    """
    elt = parseDoc(file).documentElement
    tag_spec_list = path.split("/")
    for i, tag_spec in enumerate(tag_spec_list):
        tag_name, arg_name, arg_value = _splitTagSpec(tag_spec)
        arg_name = arg_name.replace("@", "").replace("'", "")
        alias = findChild(elt, 'alias')
        if alias and alias.attributes['source'].nodeValue == 'locale':
            path = alias.attributes['path'].nodeValue
            # resolve all dot-dot parts of the path
            aliaspath = []
            for part in tag_spec_list[:i] + path.split("/"):
                if part == '..':
                    del aliaspath[-1:]
                else:
                    aliaspath.append(part)
            # remove attribute specification that our xpathlite doesnt support
            aliaspath = [x.replace("@type=", "").replace("'", "") for x in aliaspath]
            # append the remaining path
            aliaspath = "/".join(aliaspath + tag_spec_list[i:])
            # "locale" aliases are special - we need to start lookup from scratch
            return (None, aliaspath)
        elt = findChild(elt, tag_name, arg_name, arg_value, draft)
        if not elt:
            return ("", None)

    if attribute is not None:
        if elt.hasAttribute(attribute):
            return (elt.getAttribute(attribute), None)
        return (None, None)
    if elt.firstChild is None:
        # Empty element: there is no content to report.
        return (None, None)
    return (elt.firstChild.nodeValue, None)

def findAlias(file):
    """Return the source of the document element's alias child, if any.

    Returns False if there is no such child or it has no source attribute.
    """
    alias_elt = findChild(parseDoc(file).documentElement, "alias")
    if not alias_elt or not alias_elt.hasAttribute('source'):
        return False
    return alias_elt.getAttribute('source')

# Lookup chains already computed, keyed by locale name.
lookup_chain_cache = {}
# Maps each explicit parent locale to the list of its child locales.
parent_locales = {}
def _fixedLookupChain(dirname, name):
    """Return the list of locale names to search for name, in order.

    The chain starts as name with trailing "_"-separated parts removed
    one at a time. It is cut short at the first entry that
    supplementalData.xml lists under a parentLocale: nothing follows
    if that parent is root, else the parent's own chain follows.
    Results are cached per name.
    """
    if name in lookup_chain_cache:
        return lookup_chain_cache[name]

    # see http://www.unicode.org/reports/tr35/#Parent_Locales
    if not parent_locales:
        tags = findTagsInFile(dirname + "/../supplemental/supplementalData.xml", "parentLocales")
        # No parentLocales element means no explicit parents.
        for ns in tags or []:
            # ns looks like this: [u'parentLocale', [(u'parent', u'root'), (u'locales', u'az_Cyrl bs_Cyrl en_Dsrt ..')]]
            tmp = {}
            parent_locale = ""
            for data in ns[1:][0]:
                tmp[data[0]] = data[1]
                if data[0] == u"parent":
                    parent_locale = data[1]
            parent_locales[parent_locale] = tmp[u"locales"].split(" ")

    # split locale name into items and iterate through them from back to front
    # example: az_Latn_AZ => [az_Latn_AZ, az_Latn, az]
    parts = name.split("_")
    items = ["_".join(parts[:n]) for n in range(len(parts), 0, -1)]

    for i, item in enumerate(items):
        for parent_locale, locales in parent_locales.items():
            if item not in locales:
                continue
            items = items[:i+1]
            if parent_locale != u"root":
                items = items + _fixedLookupChain(dirname, parent_locale)
            lookup_chain_cache[name] = items
            return items

    lookup_chain_cache[name] = items
    return items

def _findEntry(base, path, draft=None, attribute=None):
    """Look path up in base's file and then along its lookup chain.

    Chain entries with no file are skipped. A file whose document
    element has an alias is replaced by the aliased locale's file. A
    source="locale" alias restarts the lookup with the rewritten path.
    Returns the first non-empty value found, or None.
    """
    if base.endswith(".xml"):
        base = base[:-4]
    (dirname, filename) = os.path.split(base)

    for item in _fixedLookupChain(dirname, filename):
        file = dirname + "/" + item + ".xml"
        if not os.path.isfile(file):
            continue
        alias = findAlias(file)
        if alias:
            # if alias is found we should follow it and stop processing current file
            # see http://www.unicode.org/reports/tr35/#Common_Elements
            aliasfile = os.path.dirname(file) + "/" + alias + ".xml"
            if not os.path.isfile(aliasfile):
                raise Error("findEntry: fatal error: found an alias '%s' to '%s', but the alias file couldn't be found" % (filename, alias))
            # found an alias, recurse into parsing it
            return _findEntry(aliasfile, path, draft, attribute)
        (result, aliaspath) = _findEntryInFile(file, path, draft, attribute)
        if aliaspath:
            # start lookup again because of the alias source="locale"
            return _findEntry(base, aliaspath, draft, attribute)
        if result:
            return result
    return None

def findEntry(base, path, draft=None, attribute=None):
    """Look up path for the locale file base, falling back to root.xml.

    base is the locale's file name, with or without its ".xml"
    suffix. The locale's lookup chain is searched first, then root.xml
    in the same directory. If root.xml redirects through a
    source="locale" alias, the search is repeated with the rewritten
    path.

    Returns the value found. Raises Error if neither the chain nor
    root.xml supplies a value or an alias to follow.
    """
    if base.endswith(".xml"):
        base = base[:-4]
    (dirname, filename) = os.path.split(base)

    result = None
    while path:
        result = _findEntry(base, path, draft, attribute)
        if result:
            return result
        (result, aliaspath) = _findEntryInFile(dirname + "/root.xml", path, draft, attribute)
        if result:
            return result
        if not aliaspath:
            raise Error("findEntry: fatal error: %s: cannot find key %s" % (filename, path))
        path = aliaspath

    return result