From c1b15005c6594d0a0828190c858d9ab8ab3353b5 Mon Sep 17 00:00:00 2001
From: Edward Welbourne <edward.welbourne@qt.io>
Date: Mon, 13 Aug 2018 15:21:58 +0200
Subject: Fix handling of default content locale data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We iterate theoretically many sources (albeit there's only really ever
one) for this data, so accumulate instead of only keeping the last
(and initialize it as the list it always ended up being, not a
dictionary, so that this can work).

The form of each token is a locale name, so it may be lang_Script just
as readily as lang_LAND; so parse (and validate) the tags more
faithfully to catch both cases.  Abstract that parsing into a function
and use it in both places that need it.

Change-Id: Ibdbc4eafefab6a5ef70646d9fea150f2cb408d05
Reviewed-by: Jędrzej Nowacki <jedrzej.nowacki@qt.io>
---
 util/local_database/cldr2qlocalexml.py | 97 +++++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 36 deletions(-)

(limited to 'util')

diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py
index 8c730cdcde..69334b1dc3 100755
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@@ -438,6 +438,36 @@ def integrateWeekData(filePath):
         else:
             locale.weekendEnd = weekendEndByCountryCode["001"]
 
+def splitLocale(name):
+    """Split name into (language, script, territory) triple as generator.
+
+    Warns about and ignores unparsed fields; script or territory is ''
+    if absent.  A name with no '_' yields only its language."""
+    tags = iter(name.split('_'))
+    yield tags.next() # Language
+    tag = tags.next()
+
+    # Script is always four letters, always capitalised:
+    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
+        yield tag
+        try:
+            tag = tags.next()
+        except StopIteration:
+            tag = ''
+    else:
+        yield ''
+
+    # Territory is upper-case or numeric:
+    if tag and tag.isupper() or tag.isdigit():
+        yield tag
+        tag = ''
+    else:
+        yield ''
+
+    # If nothing is left, StopIteration from tags.next() ends the generator, skipping the warning:
+    tag = (tag if tag else tags.next(),)
+    sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(tag + tuple(tags)), name))
+
 if len(sys.argv) != 2:
     usage()
 
@@ -451,30 +481,25 @@ cldr_files = os.listdir(cldr_dir)
 locale_database = {}
 
 # see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
-defaultContent_locales = {}
+defaultContent_locales = []
 for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
                                       'supplementalMetadata.xml'),
                          'metadata/defaultContent'):
     for data in ns[1:][0]:
         if data[0] == u"locales":
-            defaultContent_locales = data[1].split()
+            defaultContent_locales += data[1].split()
 
 for file in defaultContent_locales:
-    items = file.split("_")
-    if len(items) == 3:
-        language_code = items[0]
-        script_code = items[1]
-        country_code = items[2]
-    else:
-        if len(items) != 2:
-            sys.stderr.write('skipping defaultContent locale "' + file + '" [neither lang_script_country nor lang_country]\n')
-            continue
-        language_code = items[0]
-        script_code = ""
-        country_code = items[1]
-        if len(country_code) == 4:
-            sys.stderr.write('skipping defaultContent locale "' + file + '" [long country code]\n')
-            continue
+    try:
+        language_code, script_code, country_code = splitLocale(file)
+    except ValueError:
+        sys.stderr.write('skipping defaultContent locale "' + file + '" [neither two nor three tags]\n')
+        continue
+
+    if not (script_code or country_code):
+        sys.stderr.write('skipping defaultContent locale "' + file + '" [second tag is neither script nor territory]\n')
+        continue
+
     try:
         l = _generateLocaleInfo(cldr_dir + "/" + file + ".xml", language_code, script_code, country_code)
         if not l:
@@ -548,31 +573,31 @@ def _parseLocale(l):
     if l == "und":
         raise xpathlite.Error("we are treating unknown locale like C")
 
-    items = l.split("_")
-    language_code = items[0]
+    parsed = splitLocale(l)
+    language_code = parsed.next()
+    script_code = country_code = ''
+    try:
+        script_code, country_code = parsed
+    except ValueError:
+        pass
+
     if language_code != "und":
         language_id = enumdata.languageCodeToId(language_code)
         if language_id == -1:
             raise xpathlite.Error('unknown language code "%s"' % language_code)
         language = enumdata.language_list[language_id][0]
 
-    if len(items) > 1:
-        script_code = items[1]
-        country_code = ""
-        if len(items) > 2:
-            country_code = items[2]
-        if len(script_code) == 4:
-            script_id = enumdata.scriptCodeToId(script_code)
-            if script_id == -1:
-                raise xpathlite.Error('unknown script code "%s"' % script_code)
-            script = enumdata.script_list[script_id][0]
-        else:
-            country_code = script_code
-        if country_code:
-            country_id = enumdata.countryCodeToId(country_code)
-            if country_id == -1:
-                raise xpathlite.Error('unknown country code "%s"' % country_code)
-            country = enumdata.country_list[country_id][0]
+    if script_code:
+        script_id = enumdata.scriptCodeToId(script_code)
+        if script_id == -1:
+            raise xpathlite.Error('unknown script code "%s"' % script_code)
+        script = enumdata.script_list[script_id][0]
+
+    if country_code:
+        country_id = enumdata.countryCodeToId(country_code)
+        if country_id == -1:
+            raise xpathlite.Error('unknown country code "%s"' % country_code)
+        country = enumdata.country_list[country_id][0]
 
     return (language, script, country)
 
-- 
cgit v1.2.3