summary refs log tree commit diff stats
path: root/util/local_database/cldr2qlocalexml.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/local_database/cldr2qlocalexml.py')
-rwxr-xr-x util/local_database/cldr2qlocalexml.py 139
1 files changed, 90 insertions, 49 deletions
diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py
index ce45f631a6..bc999e1b65 100755
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@@ -51,6 +51,7 @@ order.
import os
import sys
import re
+import textwrap
import enumdata
import xpathlite
@@ -59,6 +60,10 @@ from dateconverter import convert_date
from localexml import Locale
findEntryInFile = xpathlite._findEntryInFile
def wrappedwarn(prefix, tokens, width=80, stream=None):
    """Write one wrapped warning listing tokens after prefix.

    The tokens are joined with ', ', appended to prefix and the resulting
    text is wrapped to at most width columns; continuation lines are
    indented by a single space.  The wrapped text, with a trailing newline,
    is written to stream.

    prefix -- text that introduces the list (include any trailing ': ').
    tokens -- iterable of strings to list.
    width  -- maximum line length passed to textwrap.wrap (default 80).
    stream -- object with a write() method; defaults to sys.stderr, looked
              up at call time so that a redirected sys.stderr is honoured.

    Returns whatever stream.write() returns."""
    if stream is None:
        stream = sys.stderr
    lines = textwrap.wrap(prefix + ', '.join(tokens),
                          subsequent_indent=' ', width=width)
    return stream.write('\n'.join(lines) + '\n')
def parse_number_format(patterns, data):
# this is a very limited parsing of the number format for currency only.
@@ -164,9 +169,9 @@ def getNumberSystems(cache={}):
entry = dict(ns[1])
name = entry[u'id']
if u'digits' in entry and ord(entry[u'digits'][0]) > 0xffff:
- # FIXME: make this redundant:
+ # FIXME, QTBUG-69324: make this redundant:
# omit number system if zero doesn't fit in single-char16 UTF-16 :-(
- sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X, QTBUG-69324]\n'
+ sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X]\n'
% (name, ord(entry[u'digits'][0])))
else:
cache[name] = entry
@@ -243,7 +248,7 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
numbering_system = None
try:
numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
- except:
+ except xpathlite.Error:
pass
def findEntryDef(path, xpath, value=''):
try:
@@ -438,6 +443,38 @@ def integrateWeekData(filePath):
else:
locale.weekendEnd = weekendEndByCountryCode["001"]
def splitLocale(name):
    """Split name into (language, script, territory) triple as generator.

    The name is split on underscores.  The first tag is always yielded as
    the language.  If there are no further tags, the generator ends there,
    so a bare language name yields exactly one value.  Otherwise exactly
    two more values are yielded:

    * script: the next tag if it is a capitalised four-letter token (first
      letter upper-case, the rest lower-case), else the empty string;
    * territory: the next unconsumed tag if it is all upper-case or all
      digits, else the empty string.

    So this always yields 1 or 3 values, never 2.  Any tags left over after
    that are ignored, with a warning written to sys.stderr.

    Termination is explicit (no StopIteration is allowed to escape from the
    generator body), so this behaves the same with or without PEP 479."""
    tags = iter(name.split('_'))
    # str.split() always returns at least one entry, so this cannot raise:
    yield next(tags) # Language
    tag = next(tags, None)
    if tag is None:
        return # Only a language was given: single-entry generator.

    # Script is always four letters, always capitalised:
    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
        yield tag
        tag = next(tags, '')
    else:
        yield ''

    # Territory is upper-case or numeric:
    if tag and (tag.isupper() or tag.isdigit()):
        yield tag
        tag = ''
    else:
        yield ''

    # Anything not consumed above (a non-empty pending tag plus whatever
    # the iterator still holds) is reported; if nothing is left, stay quiet:
    cruft = ([tag] if tag else []) + list(tags)
    if cruft:
        sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(cruft), name))
+
if len(sys.argv) != 2:
usage()
@@ -451,34 +488,30 @@ cldr_files = os.listdir(cldr_dir)
locale_database = {}
# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = {}
+defaultContent_locales = []
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
'supplementalMetadata.xml'),
'metadata/defaultContent'):
for data in ns[1:][0]:
if data[0] == u"locales":
- defaultContent_locales = data[1].split()
+ defaultContent_locales += data[1].split()
+skips = []
for file in defaultContent_locales:
- items = file.split("_")
- if len(items) == 3:
- language_code = items[0]
- script_code = items[1]
- country_code = items[2]
- else:
- if len(items) != 2:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [neither lang_script_country nor lang_country]\n')
- continue
- language_code = items[0]
- script_code = ""
- country_code = items[1]
- if len(country_code) == 4:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [long country code]\n')
- continue
+ try:
+ language_code, script_code, country_code = splitLocale(file)
+ except ValueError:
+ sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
+ continue
+
+ if not (script_code or country_code):
+ sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
+ continue
+
try:
l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
if not l:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [no locale info generated]\n')
+ skips.append(file)
continue
except xpathlite.Error as e:
sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (file, str(e)))
@@ -486,11 +519,15 @@ for file in defaultContent_locales:
locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
+if skips:
+ wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
+ skips = []
+
for file in cldr_files:
try:
l = generateLocaleInfo(cldr_dir + "/" + file)
if not l:
- sys.stderr.write('skipping file "' + file + '" [no locale info generated]\n')
+ skips.append(file)
continue
except xpathlite.Error as e:
sys.stderr.write('skipping file "%s" (%s)\n' % (file, str(e)))
@@ -498,6 +535,9 @@ for file in cldr_files:
locale_database[(l.language_id, l.script_id, l.country_id, l.variant_code)] = l
+if skips:
+ wrappedwarn('skipping files [no locale info generated]: ', skips)
+
integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
locale_keys = locale_database.keys()
locale_keys.sort()
@@ -548,34 +588,35 @@ def _parseLocale(l):
if l == "und":
raise xpathlite.Error("we are treating unknown locale like C")
- items = l.split("_")
- language_code = items[0]
+ parsed = splitLocale(l)
+ language_code = parsed.next()
+ script_code = country_code = ''
+ try:
+ script_code, country_code = parsed
+ except ValueError:
+ pass
+
if language_code != "und":
language_id = enumdata.languageCodeToId(language_code)
if language_id == -1:
raise xpathlite.Error('unknown language code "%s"' % language_code)
language = enumdata.language_list[language_id][0]
- if len(items) > 1:
- script_code = items[1]
- country_code = ""
- if len(items) > 2:
- country_code = items[2]
- if len(script_code) == 4:
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raise xpathlite.Error('unknown script code "%s"' % script_code)
- script = enumdata.script_list[script_id][0]
- else:
- country_code = script_code
- if country_code:
- country_id = enumdata.countryCodeToId(country_code)
- if country_id == -1:
- raise xpathlite.Error('unknown country code "%s"' % country_code)
- country = enumdata.country_list[country_id][0]
+ if script_code:
+ script_id = enumdata.scriptCodeToId(script_code)
+ if script_id == -1:
+ raise xpathlite.Error('unknown script code "%s"' % script_code)
+ script = enumdata.script_list[script_id][0]
+
+ if country_code:
+ country_id = enumdata.countryCodeToId(country_code)
+ if country_id == -1:
+ raise xpathlite.Error('unknown country code "%s"' % country_code)
+ country = enumdata.country_list[country_id][0]
return (language, script, country)
+skips = []
print " <likelySubtags>"
for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
tmp = {}
@@ -583,14 +624,13 @@ for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likel
tmp[data[0]] = data[1]
try:
- (from_language, from_script, from_country) = _parseLocale(tmp[u"from"])
+ from_language, from_script, from_country = _parseLocale(tmp[u"from"])
+ to_language, to_script, to_country = _parseLocale(tmp[u"to"])
except xpathlite.Error as e:
- sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
- continue
- try:
- (to_language, to_script, to_country) = _parseLocale(tmp[u"to"])
- except xpathlite.Error as e:
- sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
+ if tmp[u'to'].startswith(tmp[u'from']) and str(e) == 'unknown language code "%s"' % tmp[u'from']:
+ skips.append(tmp[u'to'])
+ else:
+ sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
continue
# substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
if to_country == "AnyCountry" and from_country != to_country:
@@ -611,7 +651,8 @@ for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likel
print " </to>"
print " </likelySubtag>"
print " </likelySubtags>"
-
+if skips:
+ wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)
print " <localeList>"
Locale.C().toXml()