Fix handling of default content locale data

In principle we iterate over many sources for this data (although in practice there is only ever one), so accumulate the locales from all of them instead of keeping only the last. Initialize the collection as the list it always ended up being, rather than as a dictionary, so that this accumulation can work.

Each token is a locale name, so it may be lang_Script just as readily as lang_LAND; parse (and validate) the tags more faithfully to catch both cases. Abstract that parsing into a function and use it in both places that need it.

Change-Id: Ibdbc4eafefab6a5ef70646d9fea150f2cb408d05
Reviewed-by: Jędrzej Nowacki <jedrzej.nowacki@qt.io>
author: Edward Welbourne <edward.welbourne@qt.io> 2018-08-13 15:21:58 +0200
committer: Edward Welbourne <edward.welbourne@qt.io> 2018-08-22 18:00:14 +0000
commit: c1b15005c6594d0a0828190c858d9ab8ab3353b5 (patch)
tree: 1a3443dea95e6f087a635b06913ef5412e5edd9a /util/local_database
parent: bbee6fac338b54a3f541bec06ef9bf554aa7f83a (diff)
1 files changed, 61 insertions, 36 deletions
diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py
index 8c730cdcde..69334b1dc3 100755
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@@ -438,6 +438,36 @@ def integrateWeekData(filePath):
         else:
             locale.weekendEnd = weekendEndByCountryCode["001"]
 
+def splitLocale(name):
+    """Split name into (language, script, territory) triple as generator.
+
+    Ignores any trailing fields, leaves script or territory empty if
+    unspecified, returns empty generator if no language found."""
+    tags = iter(name.split('_'))
+    yield tags.next() # Language
+    tag = tags.next()
+
+    # Script is always four letters, always capitalised:
+    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+        yield tag
+        try:
+            tag = tags.next()
+        except StopIteration:
+            tag = ''
+    else:
+        yield ''
+
+    # Territory is upper-case or numeric:
+    if tag and tag.isupper() or tag.isdigit():
+        yield tag
+        tag = ''
+    else:
+        yield ''
+
+    # If nothing is left, StopIteration will avoid the warning:
+    tag = (tag if tag else tags.next(),)
+    sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
+
 if len(sys.argv) != 2:
     usage()
 
@@ -451,30 +481,25 @@ cldr_files = os.listdir(cldr_dir)
 locale_database = {}
 
 # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = {}
+defaultContent_locales = []
 for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
                                       'supplementalMetadata.xml'),
                          'metadata/defaultContent'):
     for data in ns[1:][0]:
         if data[0] == u"locales":
-            defaultContent_locales = data[1].split()
+            defaultContent_locales += data[1].split()
 
 for file in defaultContent_locales:
-    items = file.split("_")
-    if len(items) == 3:
-        language_code = items[0]
-        script_code = items[1]
-        country_code = items[2]
-    else:
-        if len(items) != 2:
-            sys.stderr.write('skipping defaultContent locale "' + file + '" [neither lang_script_country nor lang_country]\n')
-            continue
-        language_code = items[0]
-        script_code = ""
-        country_code = items[1]
-        if len(country_code) == 4:
-            sys.stderr.write('skipping defaultContent locale "' + file + '" [long country code]\n')
-            continue
+    try:
+        language_code, script_code, country_code = splitLocale(file)
+    except ValueError:
+        sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
+        continue
+
+    if not (script_code or country_code):
+        sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
+        continue
+
     try:
         l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
         if not l:
@@ -548,31 +573,31 @@ def _parseLocale(l):
     if l == "und":
         raise xpathlite.Error("we are treating unknown locale like C")
 
-    items = l.split("_")
-    language_code = items[0]
+    parsed = splitLocale(l)
+    language_code = parsed.next()
+    script_code = country_code = ''
+    try:
+        script_code, country_code = parsed
+    except ValueError:
+        pass
+
     if language_code != "und":
         language_id = enumdata.languageCodeToId(language_code)
         if language_id == -1:
             raise xpathlite.Error('unknown language code "%s"' % language_code)
         language = enumdata.language_list[language_id][0]
 
-    if len(items) > 1:
-        script_code = items[1]
-        country_code = ""
-        if len(items) > 2:
-            country_code = items[2]
-        if len(script_code) == 4:
-            script_id = enumdata.scriptCodeToId(script_code)
-            if script_id == -1:
-                raise xpathlite.Error('unknown script code "%s"' % script_code)
-            script = enumdata.script_list[script_id][0]
-        else:
-            country_code = script_code
-        if country_code:
-            country_id = enumdata.countryCodeToId(country_code)
-            if country_id == -1:
-                raise xpathlite.Error('unknown country code "%s"' % country_code)
-            country = enumdata.country_list[country_id][0]
+    if script_code:
+        script_id = enumdata.scriptCodeToId(script_code)
+        if script_id == -1:
+            raise xpathlite.Error('unknown script code "%s"' % script_code)
+        script = enumdata.script_list[script_id][0]
+
+    if country_code:
+        country_id = enumdata.countryCodeToId(country_code)
+        if country_id == -1:
+            raise xpathlite.Error('unknown country code "%s"' % country_code)
+        country = enumdata.country_list[country_id][0]
 
     return (language, script, country)
author	Edward Welbourne <edward.welbourne@qt.io>	2018-08-13 15:21:58 +0200
committer	Edward Welbourne <edward.welbourne@qt.io>	2018-08-22 18:00:14 +0000
commit	c1b15005c6594d0a0828190c858d9ab8ab3353b5 (patch)
tree	1a3443dea95e6f087a635b06913ef5412e5edd9a /util/local_database
parent	bbee6fac338b54a3f541bec06ef9bf554aa7f83a (diff)