summaryrefslogtreecommitdiffstats
path: root/util/locale_database/qlocalexml.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/qlocalexml.py')
-rw-r--r--util/locale_database/qlocalexml.py368
1 files changed, 346 insertions, 22 deletions
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index 0a4628e05e..550021ba01 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -28,11 +28,18 @@
#############################################################################
"""Shared serialization-scanning code for QLocaleXML format.
-The Locale class is written by cldr2qlocalexml.py and read by qlocalexml2cpp.py
+Provides classes:
+ Locale -- common data-type representing one locale as a namespace
+ QLocaleXmlWriter -- helper to write a QLocaleXML file
+ QLocaleXmlReader -- helper to read a QLocaleXML file back in
+
+Support:
+ Spacer -- provides control over indentation of the output.
"""
+from __future__ import print_function
from xml.sax.saxutils import escape
-import xpathlite
+from localetools import Error
# Tools used by Locale:
def camel(seq):
@@ -43,6 +50,10 @@ def camel(seq):
def camelCase(words):
return ''.join(camel(iter(words)))
+def addEscapes(s):
+ return ''.join(c if n < 128 else '\\x{:02x}'.format(n)
+ for n, c in ((ord(c), c) for c in s))
+
def startCount(c, text): # strspn
"""First index in text where it doesn't have a character in c"""
assert text and text[0] in c
@@ -58,6 +69,8 @@ def convertFormat(format):
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* QDateTimeParser::parseFormat() and QLocalePrivate::dateTimeToString()
"""
+ # Compare and contrast dateconverter.py's convert_date().
+ # Need to (check consistency and) reduce redundancy !
result = ""
i = 0
while i < len(format):
@@ -102,7 +115,314 @@ def convertFormat(format):
return result
-class Locale:
+class QLocaleXmlReader (object):
+ def __init__(self, filename):
+ self.root = self.__parse(filename)
+ # Lists of (id, name, code) triples:
+ languages = tuple(self.__loadMap('language'))
+ scripts = tuple(self.__loadMap('script'))
+ countries = tuple(self.__loadMap('country'))
+ self.__likely = tuple(self.__likelySubtagsMap())
+ # Mappings {ID: (name, code)}
+ self.languages = dict((v[0], v[1:]) for v in languages)
+ self.scripts = dict((v[0], v[1:]) for v in scripts)
+ self.countries = dict((v[0], v[1:]) for v in countries)
+ # Private mappings {name: (ID, code)}
+ self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
+ self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
+ self.__landByName = dict((v[1], (v[0], v[2])) for v in countries)
+ # Other properties:
+ self.dupes = set(v[1] for v in languages) & set(v[1] for v in countries)
+ self.cldrVersion = self.__firstChildText(self.root, "version")
+
+ def loadLocaleMap(self, calendars, grumble = lambda text: None):
+ kid = self.__firstChildText
+ likely = dict(self.__likely)
+ for elt in self.__eachEltInGroup(self.root, 'localeList', 'locale'):
+ locale = Locale.fromXmlData(lambda k: kid(elt, k), calendars)
+ language = self.__langByName[locale.language][0]
+ script = self.__textByName[locale.script][0]
+ country = self.__landByName[locale.country][0]
+
+ if language != 1: # C
+ if country == 0:
+ grumble('loadLocaleMap: No country id for "{}"\n'.format(locale.language))
+
+ if script == 0:
+ # Find default script for the given language and country - see:
+ # http://www.unicode.org/reports/tr35/#Likely_Subtags
+ try:
+ try:
+ to = likely[(locale.language, 'AnyScript', locale.country)]
+ except KeyError:
+ to = likely[(locale.language, 'AnyScript', 'AnyCountry')]
+ except KeyError:
+ pass
+ else:
+ locale.script = to[1]
+ script = self.__textByName[locale.script][0]
+
+ yield (language, script, country), locale
+
+ def languageIndices(self, locales):
+ index = 0
+ for key, value in self.languages.iteritems():
+ i, count = 0, locales.count(key)
+ if count > 0:
+ i = index
+ index += count
+ yield i, value[0]
+
+ def likelyMap(self):
+ def tag(t):
+ lang, script, land = t
+ yield lang[1] if lang[0] else 'und'
+ if script[0]: yield script[1]
+ if land[0]: yield land[1]
+
+ def ids(t):
+ return tuple(x[0] for x in t)
+
+ for i, pair in enumerate(self.__likely, 1):
+ have = self.__fromNames(pair[0])
+ give = self.__fromNames(pair[1])
+ yield ('_'.join(tag(have)), ids(have),
+ '_'.join(tag(give)), ids(give),
+ i == len(self.__likely))
+
+ def defaultMap(self):
+ """Map language and script to their default country by ID.
+
+ Yields ((language, script), country) wherever the likely
+ sub-tags mapping says language's default locale uses the given
+ script and country."""
+ for have, give in self.__likely:
+ if have[1:] == ('AnyScript', 'AnyCountry') and give[2] != 'AnyCountry':
+ assert have[0] == give[0], (have, give)
+ yield ((self.__langByName[give[0]][0],
+ self.__textByName[give[1]][0]),
+ self.__landByName[give[2]][0])
+
+ # Implementation details:
+ def __loadMap(self, category):
+ kid = self.__firstChildText
+ for element in self.__eachEltInGroup(self.root, category + 'List', category):
+ yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code')
+
+ def __likelySubtagsMap(self):
+ def triplet(element, keys=('language', 'script', 'country'), kid = self.__firstChildText):
+ return tuple(kid(element, key) for key in keys)
+
+ kid = self.__firstChildElt
+ for elt in self.__eachEltInGroup(self.root, 'likelySubtags', 'likelySubtag'):
+ yield triplet(kid(elt, "from")), triplet(kid(elt, "to"))
+
+ def __fromNames(self, names):
+ return self.__langByName[names[0]], self.__textByName[names[1]], self.__landByName[names[2]]
+
+ # DOM access:
+ from xml.dom import minidom
+ @staticmethod
+ def __parse(filename, read = minidom.parse):
+ return read(filename).documentElement
+
+ @staticmethod
+ def __isNodeNamed(elt, name, TYPE=minidom.Node.ELEMENT_NODE):
+ return elt.nodeType == TYPE and elt.nodeName == name
+ del minidom
+
+ @staticmethod
+ def __eltWords(elt):
+ child = elt.firstChild
+ while child:
+ if child.nodeType == elt.TEXT_NODE:
+ yield child.nodeValue
+ child = child.nextSibling
+
+ @classmethod
+ def __firstChildElt(cls, parent, name):
+ child = parent.firstChild
+ while child:
+ if cls.__isNodeNamed(child, name):
+ return child
+ child = child.nextSibling
+
+ raise Error('No {} child found'.format(name))
+
+ @classmethod
+ def __firstChildText(cls, elt, key):
+ return ' '.join(cls.__eltWords(cls.__firstChildElt(elt, key)))
+
+ @classmethod
+ def __eachEltInGroup(cls, parent, group, key):
+ try:
+ element = cls.__firstChildElt(parent, group).firstChild
+ except Error:
+ element = None
+
+ while element:
+ if cls.__isNodeNamed(element, key):
+ yield element
+ element = element.nextSibling
+
+
+class Spacer (object):
+ def __init__(self, indent = None, initial = ''):
+ """Prepare to manage indentation and line breaks.
+
+ Arguments are both optional.
+
+ First argument, indent, is either None (its default, for
+ 'minifying'), an ingeter (number of spaces) or the unit of
+ text that is to be used for each indentation level (e.g. '\t'
+ to use tabs). If indent is None, no indentation is added, nor
+ are line-breaks; otherwise, self(text), for non-empty text,
+ shall end with a newline and begin with indentation.
+
+ Second argument, initial, is the initial indentation; it is
+ ignored if indent is None. Indentation increases after each
+ call to self(text) in which text starts with a tag and doesn't
+ include its end-tag; indentation decreases if text starts with
+ an end-tag. The text is not parsed any more carefully than
+ just described.
+ """
+ if indent is None:
+ self.__call = lambda x: x
+ else:
+ self.__each = ' ' * indent if isinstance(indent, int) else indent
+ self.current = initial
+ self.__call = self.__wrap
+
+ def __wrap(self, line):
+ if not line:
+ return '\n'
+
+ indent = self.current
+ if line.startswith('</'):
+ indent = self.current = indent[:-len(self.__each)]
+ elif line.startswith('<') and not line.startswith('<!'):
+ cut = line.find('>')
+ tag = (line[1:] if cut < 0 else line[1 : cut]).strip().split()[0]
+ if '</{}>'.format(tag) not in line:
+ self.current += self.__each
+ return indent + line + '\n'
+
+ def __call__(self, line):
+ return self.__call(line)
+
+class QLocaleXmlWriter (object):
+ def __init__(self, save = None, space = Spacer(4)):
+ """Set up to write digested CLDR data as QLocale XML.
+
+ Arguments are both optional.
+
+ First argument, save, is None (its default) or a callable that
+ will write content to where you intend to save it. If None, it
+ is replaced with a callable that prints the given content,
+ suppressing the newline (but see the following); this is
+ equivalent to passing sys.stdout.write.
+
+ Second argument, space, is an object to call on each text
+ output to prepend indentation and append newlines, or not as
+ the case may be. The default is a Spacer(4), which grows
+ indent by four spaces after each unmatched new tag and shrinks
+ back on a close-tag (its parsing is naive, but adequate to how
+ this class uses it), while adding a newline to each line.
+ """
+ self.__rawOutput = self.__printit if save is None else save
+ self.__wrap = space
+ self.__write('<localeDatabase>')
+
+ # Output of various sections, in their usual order:
+ def enumData(self, languages, scripts, countries):
+ self.__enumTable('languageList', languages)
+ self.__enumTable('scriptList', scripts)
+ self.__enumTable('countryList', countries)
+
+ def likelySubTags(self, entries):
+ self.__openTag('likelySubtags')
+ for have, give in entries:
+ self.__openTag('likelySubtag')
+ self.__likelySubTag('from', have)
+ self.__likelySubTag('to', give)
+ self.__closeTag('likelySubtag')
+ self.__closeTag('likelySubtags')
+
+ def locales(self, locales, calendars):
+ self.__openTag('localeList')
+ self.__openTag('locale')
+ Locale.C(calendars).toXml(self.inTag, calendars)
+ self.__closeTag('locale')
+ keys = locales.keys()
+ keys.sort()
+ for key in keys:
+ self.__openTag('locale')
+ locales[key].toXml(self.inTag, calendars)
+ self.__closeTag('locale')
+ self.__closeTag('localeList')
+
+ def version(self, cldrVersion):
+ self.inTag('version', cldrVersion)
+
+ def inTag(self, tag, text):
+ self.__write('<{0}>{1}</{0}>'.format(tag, text))
+
+ def close(self):
+ if self.__rawOutput != self.__complain:
+ self.__write('</localeDatabase>')
+ self.__rawOutput = self.__complain
+
+ # Implementation details
+ @staticmethod
+ def __printit(text):
+ print(text, end='')
+ @staticmethod
+ def __complain(text):
+ raise Error('Attempted to write data after closing :-(')
+
+ def __enumTable(self, tag, table):
+ self.__openTag(tag)
+ for key, value in table.iteritems():
+ self.__openTag(tag[:-4])
+ self.inTag('name', value[0])
+ self.inTag('id', key)
+ self.inTag('code', value[1])
+ self.__closeTag(tag[:-4])
+ self.__closeTag(tag)
+
+ def __likelySubTag(self, tag, likely):
+ self.__openTag(tag)
+ self.inTag('language', likely[0])
+ self.inTag('script', likely[1])
+ self.inTag('country', likely[2])
+ # self.inTag('variant', likely[3])
+ self.__closeTag(tag)
+
+ def __openTag(self, tag):
+ self.__write('<{}>'.format(tag))
+ def __closeTag(self, tag):
+ self.__write('</{}>'.format(tag))
+
+ def __write(self, line):
+ self.__rawOutput(self.__wrap(line))
+
+class Locale (object):
+ """Holder for the assorted data representing one locale.
+
+ Implemented as a namespace; its constructor and update() have the
+ same signatures as those of a dict, acting on the instance's
+ __dict__, so the results are accessed as attributes rather than
+ mapping keys."""
+ def __init__(self, data=None, **kw):
+ self.update(data, **kw)
+
+ def update(self, data=None, **kw):
+ if data: self.__dict__.update(data)
+ if kw: self.__dict__.update(kw)
+
+ def __len__(self): # Used when testing as a boolean
+ return len(self.__dict__)
+
@staticmethod
def propsMonthDay(scale, lengths=('long', 'short', 'narrow')):
for L in lengths:
@@ -158,16 +478,24 @@ class Locale:
return cls(data)
- def toXml(self, calendars=('gregorian',), indent=' ', tab=' '):
- print indent + '<locale>'
- inner = indent + tab
+ def toXml(self, write, calendars=('gregorian',)):
+ """Writes its data as QLocale XML.
+
+ First argument, write, is a callable taking the name and
+ content of an XML element; it is expected to be the inTag
+ bound method of a QLocaleXmlWriter instance.
+
+ Optional second argument is a list of calendar names, in the
+ form used by CLDR; its default is ('gregorian',).
+ """
get = lambda k: getattr(self, k)
for key in ('language', 'script', 'country'):
- print inner + "<%s>" % key + get(key) + "</%s>" % key
- print inner + "<%scode>" % key + get(key + '_code') + "</%scode>" % key
+ write(key, get(key))
+ write('{}code'.format(key), get('{}_code'.format(key)))
- for key in ('decimal', 'group', 'zero', 'list', 'percent', 'minus', 'plus', 'exp'):
- print inner + "<%s>" % key + get(key) + "</%s>" % key
+ for key in ('decimal', 'group', 'zero', 'list',
+ 'percent', 'minus', 'plus', 'exp'):
+ write(key, get(key))
for key in ('languageEndonym', 'countryEndonym',
'quotationStart', 'quotationEnd',
@@ -185,16 +513,10 @@ class Locale:
'_'.join((k, cal))
for k in self.propsMonthDay('months')
for cal in calendars):
- print inner + "<%s>%s</%s>" % (key, escape(get(key)).encode('utf-8'), key)
+ write(key, escape(get(key)).encode('utf-8'))
for key in ('currencyDigits', 'currencyRounding'):
- print inner + "<%s>%d</%s>" % (key, get(key), key)
-
- print indent + "</locale>"
-
- def __init__(self, data=None, **kw):
- if data: self.__dict__.update(data)
- if kw: self.__dict__.update(kw)
+ write(key, get(key))
# Tools used by __monthNames:
def fullName(i, name): return name
@@ -213,6 +535,9 @@ class Locale:
@staticmethod
def __monthNames(calendars,
known={ # Map calendar to (names, extractors...):
+ # TODO: do we even need these ? CLDR's root.xml seems to
+ # have them, complete with yeartype="leap" handling for
+ # Hebrew's extra.
'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July',
'August', 'September', 'October', 'November', 'December'),
# Extractor pairs, (plain, standalone)
@@ -240,8 +565,8 @@ class Locale:
for cal in calendars:
try:
data = known[cal]
- except KeyError: # Need to add an entry to known, above.
- print 'Unsupported calendar:', cal
+ except KeyError as e: # Need to add an entry to known, above.
+ e.args += ('Unsupported calendar:', cal)
raise
names, get = data[0], data[1:]
for n, size in enumerate(sizes):
@@ -253,12 +578,11 @@ class Locale:
@classmethod
def C(cls, calendars=('gregorian',),
- # Empty entry at end to ensure final separator when join()ed:
days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
'Thursday', 'Friday', 'Saturday'),
quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
"""Returns an object representing the C locale."""
- return cls(dict(cls.__monthNames(calendars)),
+ return cls(cls.__monthNames(calendars),
language='C', language_code='0', languageEndonym='',
script='AnyScript', script_code='0',
country='AnyCountry', country_code='0', countryEndonym='',