summaryrefslogtreecommitdiffstats
path: root/util/local_database
diff options
context:
space:
mode:
authorEdward Welbourne <edward.welbourne@qt.io>2018-08-13 15:21:58 +0200
committerEdward Welbourne <edward.welbourne@qt.io>2018-08-22 18:00:14 +0000
commitc1b15005c6594d0a0828190c858d9ab8ab3353b5 (patch)
tree1a3443dea95e6f087a635b06913ef5412e5edd9a /util/local_database
parentbbee6fac338b54a3f541bec06ef9bf554aa7f83a (diff)
Fix handling of default content locale data
We iterate theoretically many sources (albeit there's only really ever one) for this data, so accumulate instead of only keeping the last (and initialize it as the list it always ended up being, not a dictionary, so that this can work). The form of each token is a locale name, so it may be lang_Script just as readily as lang_LAND; so parse (and validate) the tags more faithfully to catch both cases. Abstract that parsing into a function and use it in both places that need it. Change-Id: Ibdbc4eafefab6a5ef70646d9fea150f2cb408d05 Reviewed-by: Jędrzej Nowacki <jedrzej.nowacki@qt.io>
Diffstat (limited to 'util/local_database')
-rwxr-xr-xutil/local_database/cldr2qlocalexml.py97
1 files changed, 61 insertions, 36 deletions
diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py
index 8c730cdcde..69334b1dc3 100755
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@@ -438,6 +438,36 @@ def integrateWeekData(filePath):
else:
locale.weekendEnd = weekendEndByCountryCode["001"]
+def splitLocale(name):
+ """Split name into (language, script, territory) triple as generator.
+
+ Ignores any trailing fields, leaves script or territory empty if
+ unspecified, returns empty generator if no language found."""
+ tags = iter(name.split('_'))
+ yield tags.next() # Language
+ tag = tags.next()
+
+ # Script is always four letters, always capitalised:
+ if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+ yield tag
+ try:
+ tag = tags.next()
+ except StopIteration:
+ tag = ''
+ else:
+ yield ''
+
+ # Territory is upper-case or numeric:
+ if tag and tag.isupper() or tag.isdigit():
+ yield tag
+ tag = ''
+ else:
+ yield ''
+
+ # If nothing is left, StopIteration will avoid the warning:
+ tag = (tag if tag else tags.next(),)
+ sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
+
if len(sys.argv) != 2:
usage()
@@ -451,30 +481,25 @@ cldr_files = os.listdir(cldr_dir)
locale_database = {}
# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = {}
+defaultContent_locales = []
for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
'supplementalMetadata.xml'),
'metadata/defaultContent'):
for data in ns[1:][0]:
if data[0] == u"locales":
- defaultContent_locales = data[1].split()
+ defaultContent_locales += data[1].split()
for file in defaultContent_locales:
- items = file.split("_")
- if len(items) == 3:
- language_code = items[0]
- script_code = items[1]
- country_code = items[2]
- else:
- if len(items) != 2:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [neither lang_script_country nor lang_country]\n')
- continue
- language_code = items[0]
- script_code = ""
- country_code = items[1]
- if len(country_code) == 4:
- sys.stderr.write('skipping defaultContent locale "' + file + '" [long country code]\n')
- continue
+ try:
+ language_code, script_code, country_code = splitLocale(file)
+ except ValueError:
+ sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
+ continue
+
+ if not (script_code or country_code):
+ sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
+ continue
+
try:
l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
if not l:
@@ -548,31 +573,31 @@ def _parseLocale(l):
if l == "und":
raise xpathlite.Error("we are treating unknown locale like C")
- items = l.split("_")
- language_code = items[0]
+ parsed = splitLocale(l)
+ language_code = parsed.next()
+ script_code = country_code = ''
+ try:
+ script_code, country_code = parsed
+ except ValueError:
+ pass
+
if language_code != "und":
language_id = enumdata.languageCodeToId(language_code)
if language_id == -1:
raise xpathlite.Error('unknown language code "%s"' % language_code)
language = enumdata.language_list[language_id][0]
- if len(items) > 1:
- script_code = items[1]
- country_code = ""
- if len(items) > 2:
- country_code = items[2]
- if len(script_code) == 4:
- script_id = enumdata.scriptCodeToId(script_code)
- if script_id == -1:
- raise xpathlite.Error('unknown script code "%s"' % script_code)
- script = enumdata.script_list[script_id][0]
- else:
- country_code = script_code
- if country_code:
- country_id = enumdata.countryCodeToId(country_code)
- if country_id == -1:
- raise xpathlite.Error('unknown country code "%s"' % country_code)
- country = enumdata.country_list[country_id][0]
+ if script_code:
+ script_id = enumdata.scriptCodeToId(script_code)
+ if script_id == -1:
+ raise xpathlite.Error('unknown script code "%s"' % script_code)
+ script = enumdata.script_list[script_id][0]
+
+ if country_code:
+ country_id = enumdata.countryCodeToId(country_code)
+ if country_id == -1:
+ raise xpathlite.Error('unknown country code "%s"' % country_code)
+ country = enumdata.country_list[country_id][0]
return (language, script, country)