diff options
Diffstat (limited to 'util/locale_database/qlocalexml.py')
-rw-r--r-- | util/locale_database/qlocalexml.py | 368 |
1 files changed, 346 insertions, 22 deletions
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py index 0a4628e05e..550021ba01 100644 --- a/util/locale_database/qlocalexml.py +++ b/util/locale_database/qlocalexml.py @@ -28,11 +28,18 @@ ############################################################################# """Shared serialization-scanning code for QLocaleXML format. -The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py +Provides classes: + Locale -- common data-type representing one locale as a namespace + QLocaleXmlWriter -- helper to write a QLocaleXML file + QLocaleXmlReader -- helper to read a QLocaleXML file back in + +Support: + Spacer -- provides control over indentation of the output. """ +from __future__ import print_function from xml.sax.saxutils import escape -import xpathlite +from localetools import Error # Tools used by Locale: def camel(seq): @@ -43,6 +50,10 @@ def camel(seq): def camelCase(words): return ''.join(camel(iter(words))) +def addEscapes(s): + return ''.join(c if n < 128 else '\\x{:02x}'.format(n) + for n, c in ((ord(c), c) for c in s)) + def startCount(c, text): # strspn """First index in text where it doesn't have a character in c""" assert text and text[0] in c @@ -58,6 +69,8 @@ def convertFormat(format): * https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table * QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString() """ + # Compare and contrast dateconverter.py's convert_date(). + # Need to (check consistency and) reduce redundancy ! 
result = "" i = 0 while i < len(format): @@ -102,7 +115,314 @@ def convertFormat(format): return result -class Locale: +class QLocaleXmlReader (object): + def __init__(self, filename): + self.root = self.__parse(filename) + # Lists of (id, name, code) triples: + languages = tuple(self.__loadMap('language')) + scripts = tuple(self.__loadMap('script')) + countries = tuple(self.__loadMap('country')) + self.__likely = tuple(self.__likelySubtagsMap()) + # Mappings {ID: (name, code)} + self.languages = dict((v[0], v[1:]) for v in languages) + self.scripts = dict((v[0], v[1:]) for v in scripts) + self.countries = dict((v[0], v[1:]) for v in countries) + # Private mappings {name: (ID, code)} + self.__langByName = dict((v[1], (v[0], v[2])) for v in languages) + self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts) + self.__landByName = dict((v[1], (v[0], v[2])) for v in countries) + # Other properties: + self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries) + self.cldrVersion = self.__firstChildText(self.root, "version") + + def loadLocaleMap(self, calendars, grumble = lambda text: None): + kid = self.__firstChildText + likely = dict(self.__likely) + for elt in self.__eachEltInGroup(self.root, 'localeList', 'locale'): + locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars) + language = self.__langByName[locale.language][0] + script = self.__textByName[locale.script][0] + country = self.__landByName[locale.country][0] + + if language != 1: # C + if country == 0: + grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language)) + + if script == 0: + # Find default script for the given language and country - see: + # http://www.unicode.org/reports/tr35/#Likely_Subtags + try: + try: + to = likely[(locale.language, 'AnyScript', locale.country)] + except KeyError: + to = likely[(locale.language, 'AnyScript', 'AnyCountry')] + except KeyError: + pass + else: + locale.script = to[1] + script = self.__textByName[locale.script][0] + + 
yield (language, script, country), locale + + def languageIndices(self, locales): + index = 0 + for key, value in self.languages.iteritems(): + i, count = 0, locales.count(key) + if count > 0: + i = index + index += count + yield i, value[0] + + def likelyMap(self): + def tag(t): + lang, script, land = t + yield lang[1] if lang[0] else 'und' + if script[0]: yield script[1] + if land[0]: yield land[1] + + def ids(t): + return tuple(x[0] for x in t) + + for i, pair in enumerate(self.__likely, 1): + have = self.__fromNames(pair[0]) + give = self.__fromNames(pair[1]) + yield ('_'.join(tag(have)), ids(have), + '_'.join(tag(give)), ids(give), + i == len(self.__likely)) + + def defaultMap(self): + """Map language and script to their default country by ID. + + Yields ((language, script), country) wherever the likely + sub-tags mapping says language's default locale uses the given + script and country.""" + for have, give in self.__likely: + if have[1:] == ('AnyScript', 'AnyCountry') and give[2] != 'AnyCountry': + assert have[0] == give[0], (have, give) + yield ((self.__langByName[give[0]][0], + self.__textByName[give[1]][0]), + self.__landByName[give[2]][0]) + + # Implementation details: + def __loadMap(self, category): + kid = self.__firstChildText + for element in self.__eachEltInGroup(self.root, category + 'List', category): + yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code') + + def __likelySubtagsMap(self): + def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText): + return tuple(kid(element, key) for key in keys) + + kid = self.__firstChildElt + for elt in self.__eachEltInGroup(self.root, 'likelySubtags', 'likelySubtag'): + yield triplet(kid(elt, "from")), triplet(kid(elt, "to")) + + def __fromNames(self, names): + return self.__langByName[names[0]], self.__textByName[names[1]], self.__landByName[names[2]] + + # DOM access: + from xml.dom import minidom + @staticmethod + def __parse(filename, read = 
minidom.parse): + return read(filename).documentElement + + @staticmethod + def __isNodeNamed(elt, name, TYPE=minidom.Node.ELEMENT_NODE): + return elt.nodeType == TYPE and elt.nodeName == name + del minidom + + @staticmethod + def __eltWords(elt): + child = elt.firstChild + while child: + if child.nodeType == elt.TEXT_NODE: + yield child.nodeValue + child = child.nextSibling + + @classmethod + def __firstChildElt(cls, parent, name): + child = parent.firstChild + while child: + if cls.__isNodeNamed(child, name): + return child + child = child.nextSibling + + raise Error('No {} child found'.format(name)) + + @classmethod + def __firstChildText(cls, elt, key): + return ' '.join(cls.__eltWords(cls.__firstChildElt(elt, key))) + + @classmethod + def __eachEltInGroup(cls, parent, group, key): + try: + element = cls.__firstChildElt(parent, group).firstChild + except Error: + element = None + + while element: + if cls.__isNodeNamed(element, key): + yield element + element = element.nextSibling + + +class Spacer (object): + def __init__(self, indent = None, initial = ''): + """Prepare to manage indentation and line breaks. + + Arguments are both optional. + + First argument, indent, is either None (its default, for + 'minifying'), an integer (number of spaces) or the unit of + text that is to be used for each indentation level (e.g. '\t' + to use tabs). If indent is None, no indentation is added, nor + are line-breaks; otherwise, self(text), for non-empty text, + shall end with a newline and begin with indentation. + + Second argument, initial, is the initial indentation; it is + ignored if indent is None. Indentation increases after each + call to self(text) in which text starts with a tag and doesn't + include its end-tag; indentation decreases if text starts with + an end-tag. The text is not parsed any more carefully than + just described. 
+ """ + if indent is None: + self.__call = lambda x: x + else: + self.__each = ' ' * indent if isinstance(indent, int) else indent + self.current = initial + self.__call = self.__wrap + + def __wrap(self, line): + if not line: + return '\n' + + indent = self.current + if line.startswith('</'): + indent = self.current = indent[:-len(self.__each)] + elif line.startswith('<') and not line.startswith('<!'): + cut = line.find('>') + tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0] + if '</{}>'.format(tag) not in line: + self.current += self.__each + return indent + line + '\n' + + def __call__(self, line): + return self.__call(line) + +class QLocaleXmlWriter (object): + def __init__(self, save = None, space = Spacer(4)): + """Set up to write digested CLDR data as QLocale XML. + + Arguments are both optional. + + First argument, save, is None (its default) or a callable that + will write content to where you intend to save it. If None, it + is replaced with a callable that prints the given content, + suppressing the newline (but see the following); this is + equivalent to passing sys.stdout.write. + + Second argument, space, is an object to call on each text + output to prepend indentation and append newlines, or not as + the case may be. The default is a Spacer(4), which grows + indent by four spaces after each unmatched new tag and shrinks + back on a close-tag (its parsing is naive, but adequate to how + this class uses it), while adding a newline to each line. 
+ """ + self.__rawOutput = self.__printit if save is None else save + self.__wrap = space + self.__write('<localeDatabase>') + + # Output of various sections, in their usual order: + def enumData(self, languages, scripts, countries): + self.__enumTable('languageList', languages) + self.__enumTable('scriptList', scripts) + self.__enumTable('countryList', countries) + + def likelySubTags(self, entries): + self.__openTag('likelySubtags') + for have, give in entries: + self.__openTag('likelySubtag') + self.__likelySubTag('from', have) + self.__likelySubTag('to', give) + self.__closeTag('likelySubtag') + self.__closeTag('likelySubtags') + + def locales(self, locales, calendars): + self.__openTag('localeList') + self.__openTag('locale') + Locale.C(calendars).toXml(self.inTag, calendars) + self.__closeTag('locale') + keys = locales.keys() + keys.sort() + for key in keys: + self.__openTag('locale') + locales[key].toXml(self.inTag, calendars) + self.__closeTag('locale') + self.__closeTag('localeList') + + def version(self, cldrVersion): + self.inTag('version', cldrVersion) + + def inTag(self, tag, text): + self.__write('<{0}>{1}</{0}>'.format(tag, text)) + + def close(self): + if self.__rawOutput != self.__complain: + self.__write('</localeDatabase>') + self.__rawOutput = self.__complain + + # Implementation details + @staticmethod + def __printit(text): + print(text, end='') + @staticmethod + def __complain(text): + raise Error('Attempted to write data after closing :-(') + + def __enumTable(self, tag, table): + self.__openTag(tag) + for key, value in table.iteritems(): + self.__openTag(tag[:-4]) + self.inTag('name', value[0]) + self.inTag('id', key) + self.inTag('code', value[1]) + self.__closeTag(tag[:-4]) + self.__closeTag(tag) + + def __likelySubTag(self, tag, likely): + self.__openTag(tag) + self.inTag('language', likely[0]) + self.inTag('script', likely[1]) + self.inTag('country', likely[2]) + # self.inTag('variant', likely[3]) + self.__closeTag(tag) + + def 
__openTag(self, tag): + self.__write('<{}>'.format(tag)) + def __closeTag(self, tag): + self.__write('</{}>'.format(tag)) + + def __write(self, line): + self.__rawOutput(self.__wrap(line)) + +class Locale (object): + """Holder for the assorted data representing one locale. + + Implemented as a namespace; its constructor and update() have the + same signatures as those of a dict, acting on the instance's + __dict__, so the results are accessed as attributes rather than + mapping keys.""" + def __init__(self, data=None, **kw): + self.update(data, **kw) + + def update(self, data=None, **kw): + if data: self.__dict__.update(data) + if kw: self.__dict__.update(kw) + + def __len__(self): # Used when testing as a boolean + return len(self.__dict__) + @staticmethod def propsMonthDay(scale, lengths=('long', 'short', 'narrow')): for L in lengths: @@ -158,16 +478,24 @@ class Locale: return cls(data) - def toXml(self, calendars=('gregorian',), indent=' ', tab=' '): - print indent + '<locale>' - inner = indent + tab + def toXml(self, write, calendars=('gregorian',)): + """Writes its data as QLocale XML. + + First argument, write, is a callable taking the name and + content of an XML element; it is expected to be the inTag + bound method of a QLocaleXmlWriter instance. + + Optional second argument is a list of calendar names, in the + form used by CLDR; its default is ('gregorian',). 
+ """ get = lambda k: getattr(self, k) for key in ('language', 'script', 'country'): - print inner + "<%s>" % key + get(key) + "</%s>" % key - print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key + write(key, get(key)) + write('{}code'.format(key), get('{}_code'.format(key))) - for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'): - print inner + "<%s>" % key + get(key) + "</%s>" % key + for key in ('decimal', 'group', 'zero', 'list', + 'percent', 'minus', 'plus', 'exp'): + write(key, get(key)) for key in ('languageEndonym', 'countryEndonym', 'quotationStart', 'quotationEnd', @@ -185,16 +513,10 @@ class Locale: '_'.join((k, cal)) for k in self.propsMonthDay('months') for cal in calendars): - print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key) + write(key, escape(get(key)).encode('utf-8')) for key in ('currencyDigits', 'currencyRounding'): - print inner + "<%s>%d</%s>" % (key, get(key), key) - - print indent + "</locale>" - - def __init__(self, data=None, **kw): - if data: self.__dict__.update(data) - if kw: self.__dict__.update(kw) + write(key, get(key)) # Tools used by __monthNames: def fullName(i, name): return name @@ -213,6 +535,9 @@ class Locale: @staticmethod def __monthNames(calendars, known={ # Map calendar to (names, extractors...): + # TODO: do we even need these ? CLDR's root.xml seems to + # have them, complete with yeartype="leap" handling for + # Hebrew's extra. 'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'), # Extractor pairs, (plain, standalone) @@ -240,8 +565,8 @@ class Locale: for cal in calendars: try: data = known[cal] - except KeyError: # Need to add an entry to known, above. - print 'Unsupported calendar:', cal + except KeyError as e: # Need to add an entry to known, above. 
+ e.args += ('Unsupported calendar:', cal) raise names, get = data[0], data[1:] for n, size in enumerate(sizes): @@ -253,12 +578,11 @@ class Locale: @classmethod def C(cls, calendars=('gregorian',), - # Empty entry at end to ensure final separator when join()ed: days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'), quantifiers=('k', 'M', 'G', 'T', 'P', 'E')): """Returns an object representing the C locale.""" - return cls(dict(cls.__monthNames(calendars)), + return cls(cls.__monthNames(calendars), language='C', language_code='0', languageEndonym='', script='AnyScript', script_code='0', country='AnyCountry', country_code='0', countryEndonym='', |