diff options
Diffstat (limited to 'util/local_database/cldr2qlocalexml.py')
-rwxr-xr-x | util/local_database/cldr2qlocalexml.py | 139 |
1 files changed, 90 insertions, 49 deletions
diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py index ce45f631a6..bc999e1b65 100755 --- a/util/local_database/cldr2qlocalexml.py +++ b/util/local_database/cldr2qlocalexml.py @@ -51,6 +51,7 @@ order. import os import sys import re +import textwrap import enumdata import xpathlite @@ -59,6 +60,10 @@ from dateconverter import convert_date from localexml import Locale findEntryInFile = xpathlite._findEntryInFile +def wrappedwarn(prefix, tokens): + return sys.stderr.write( + '\n'.join(textwrap.wrap(prefix + ', '.join(tokens), + subsequent_indent=' ', width=80)) + '\n') def parse_number_format(patterns, data): # this is a very limited parsing of the number format for currency only. @@ -164,9 +169,9 @@ def getNumberSystems(cache={}): entry = dict(ns[1]) name = entry[u'id'] if u'digits' in entry and ord(entry[u'digits'][0]) > 0xffff: - # FIXME: make this redundant: + # FIXME, QTBUG-69324: make this redundant: # omit number system if zero doesn't fit in single-char16 UTF-16 :-( - sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X, QTBUG-69324]\n' + sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X]\n' % (name, ord(entry[u'digits'][0]))) else: cache[name] = entry @@ -243,7 +248,7 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_ numbering_system = None try: numbering_system = findEntry(path, "numbers/defaultNumberingSystem") - except: + except xpathlite.Error: pass def findEntryDef(path, xpath, value=''): try: @@ -438,6 +443,38 @@ def integrateWeekData(filePath): else: locale.weekendEnd = weekendEndByCountryCode["001"] +def splitLocale(name): + """Split name into (language, script, territory) triple as generator. + + Ignores any trailing fields (with a warning), leaves script (a capitalised + four-letter token) or territory (either a number or an all-uppercase token) + empty if unspecified, returns a single-entry generator if name is a single + tag (i.e. contains no underscores). Always yields 1 or 3 values, never 2.""" + tags = iter(name.split('_')) + yield tags.next() # Language + tag = tags.next() + + # Script is always four letters, always capitalised: + if len(tag) == 4 and tag[0].isupper() and tag[1:].islower(): + yield tag + try: + tag = tags.next() + except StopIteration: + tag = '' + else: + yield '' + + # Territory is upper-case or numeric: + if tag and tag.isupper() or tag.isdigit(): + yield tag + tag = '' + else: + yield '' + + # If nothing is left, StopIteration will avoid the warning: + tag = (tag if tag else tags.next(),) + sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name)) + if len(sys.argv) != 2: usage() @@ -451,34 +488,30 @@ cldr_files = os.listdir(cldr_dir) locale_database = {} # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content -defaultContent_locales = {} +defaultContent_locales = [] for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental', 'supplementalMetadata.xml'), 'metadata/defaultContent'): for data in ns[1:][0]: if data[0] == u"locales": - defaultContent_locales = data[1].split() + defaultContent_locales += data[1].split() +skips = [] for file in defaultContent_locales: - items = file.split("_") - if len(items) == 3: - language_code = items[0] - script_code = items[1] - country_code = items[2] - else: - if len(items) != 2: - sys.stderr.write('skipping defaultContent locale "' + file + '" [neither lang_script_country nor lang_country]\n') - continue - language_code = items[0] - script_code = "" - country_code = items[1] - if len(country_code) == 4: - sys.stderr.write('skipping defaultContent locale "' + file + '" [long country code]\n') - continue + try: + language_code, script_code, country_code = splitLocale(file) + except ValueError: + sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n') + continue + + if not (script_code or country_code): + sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n') + continue + try: l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code) if not l: - sys.stderr.write('skipping defaultContent locale "' + file + '" [no locale info generated]\n') + skips.append(file) continue except xpathlite.Error as e: sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e))) @@ -486,11 +519,15 @@ for file in defaultContent_locales: locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l +if skips: + wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips) + skips = [] + for file in cldr_files: try: l = generateLocaleInfo(cldr_dir + "/" + file) if not l: - sys.stderr.write('skipping file "' + file + '" [no locale info generated]\n') + skips.append(file) continue except xpathlite.Error as e: sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e))) @@ -498,6 +535,9 @@ for file in cldr_files: locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l +if skips: + wrappedwarn('skipping files [no locale info generated]: ', skips) + integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml") locale_keys = locale_database.keys() locale_keys.sort() @@ -548,34 +588,35 @@ def _parseLocale(l): if l == "und": raise xpathlite.Error("we are treating unknown locale like C") - items = l.split("_") - language_code = items[0] + parsed = splitLocale(l) + language_code = parsed.next() + script_code = country_code = '' + try: + script_code, country_code = parsed + except ValueError: + pass + if language_code != "und": language_id = enumdata.languageCodeToId(language_code) if language_id == -1: raise xpathlite.Error('unknown language code "%s"' % language_code) language = enumdata.language_list[language_id][0] - if len(items) > 1: - script_code = items[1] - country_code = "" - if len(items) > 2: - country_code = items[2] - if len(script_code) == 4: - script_id = enumdata.scriptCodeToId(script_code) - if script_id == -1: - raise xpathlite.Error('unknown script code "%s"' % script_code) - script = enumdata.script_list[script_id][0] - else: - country_code = script_code - if country_code: - country_id = enumdata.countryCodeToId(country_code) - if country_id == -1: - raise xpathlite.Error('unknown country code "%s"' % country_code) - country = enumdata.country_list[country_id][0] + if script_code: + script_id = enumdata.scriptCodeToId(script_code) + if script_id == -1: + raise xpathlite.Error('unknown script code "%s"' % script_code) + script = enumdata.script_list[script_id][0] + + if country_code: + country_id = enumdata.countryCodeToId(country_code) + if country_id == -1: + raise xpathlite.Error('unknown country code "%s"' % country_code) + country = enumdata.country_list[country_id][0] return (language, script, country) +skips = [] print " <likelySubtags>" for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"): tmp = {} @@ -583,14 +624,13 @@ for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likel tmp[data[0]] = data[1] try: - (from_language, from_script, from_country) = _parseLocale(tmp[u"from"]) + from_language, from_script, from_country = _parseLocale(tmp[u"from"]) + to_language, to_script, to_country = _parseLocale(tmp[u"to"]) except xpathlite.Error as e: - sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) - continue - try: - (to_language, to_script, to_country) = _parseLocale(tmp[u"to"]) - except xpathlite.Error as e: - sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) + if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']: + skips.append(tmp[u'to']) + else: + sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e))) continue # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags if to_country == "AnyCountry" and from_country != to_country: @@ -611,7 +651,8 @@ for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likel print " </to>" print " </likelySubtag>" print " </likelySubtags>" - +if skips: + wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips) print " <localeList>" Locale.C().toXml() |