From fe8962d3a506a850878401309f81247c80f3d203 Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Wed, 21 Nov 2012 06:08:24 +0200 Subject: Use likelySubtags to instantiate a locale id from its short form ...just like described in http://www.unicode.org/reports/tr35/#Likely_Subtags. This is much more effective than the current "guessing" algorithm + makes it possible to instantiate a locale by the script or territory code only. Change-Id: I674f8476e65b01c56960b6e83a1a346df0715274 Reviewed-by: Lars Knoll --- util/local_database/cldr2qlocalexml.py | 55 ++++++++++--------- util/local_database/qlocalexml2cpp.py | 97 +++++++++++++++++++++++++++++++--- 2 files changed, 120 insertions(+), 32 deletions(-) (limited to 'util') diff --git a/util/local_database/cldr2qlocalexml.py b/util/local_database/cldr2qlocalexml.py index 893b662f25..1604d0e14b 100755 --- a/util/local_database/cldr2qlocalexml.py +++ b/util/local_database/cldr2qlocalexml.py @@ -48,6 +48,7 @@ from xpathlite import DraftResolution from dateconverter import convert_date import re +findAlias = xpathlite.findAlias findEntry = xpathlite.findEntry findEntryInFile = xpathlite._findEntryInFile findTagsInFile = xpathlite.findTagsInFile @@ -116,6 +117,12 @@ def generateLocaleInfo(path): if not path.endswith(".xml"): return {} + + # skip legacy/compatibility ones + alias = findAlias(path) + if alias: + raise xpathlite.Error("alias to \"%s\"" % alias) + language_code = findEntryInFile(path, "identity/language", attribute="type")[0] if language_code == 'root': # just skip it @@ -128,18 +135,16 @@ def generateLocaleInfo(path): # ### actually there is only one locale with variant: en_US_POSIX # does anybody care about it at all? 
if variant_code: - return {} + raise xpathlite.Error("we do not support variants (\"%s\")" % variant_code) language_id = enumdata.languageCodeToId(language_code) if language_id <= 0: - sys.stderr.write("unknown language code \"" + language_code + "\"\n") - return {} + raise xpathlite.Error("unknown language code \"%s\"" % language_code) language = enumdata.language_list[language_id][0] script_id = enumdata.scriptCodeToId(script_code) if script_id == -1: - sys.stderr.write("unknown script code \"" + script_code + "\"\n") - return {} + raise xpathlite.Error("unknown script code \"%s\"" % script_code) script = enumdata.script_list[script_id][0] # we should handle fully qualified names with the territory @@ -147,8 +152,7 @@ def generateLocaleInfo(path): return {} country_id = enumdata.countryCodeToId(country_code) if country_id <= 0: - sys.stderr.write("unknown country code \"" + country_code + "\"\n") - return {} + raise xpathlite.Error("unknown country code \"%s\"" % country_code) country = enumdata.country_list[country_id][0] # So we say we accept only those values that have "contributed" or @@ -557,9 +561,13 @@ cldr_files = os.listdir(cldr_dir) locale_database = {} for file in cldr_files: - l = generateLocaleInfo(cldr_dir + "/" + file) - if not l: - sys.stderr.write("skipping file \"" + file + "\"\n") + try: + l = generateLocaleInfo(cldr_dir + "/" + file) + if not l: + sys.stderr.write("skipping file \"" + file + "\"\n") + continue + except xpathlite.Error as e: + sys.stderr.write("skipping file \"%s\" (%s)\n" % (file, str(e))) continue locale_database[(l['language_id'], l['script_id'], l['country_id'], l['variant_code'])] = l @@ -611,16 +619,15 @@ def _parseLocale(l): script = "AnyScript" country = "AnyCountry" - if l == "und": # we are treating unknown locale like C - return (None, None, None) + if l == "und": + raise xpathlite.Error("we are treating unknown locale like C") items = l.split("_") language_code = items[0] if language_code != "und": language_id = 
enumdata.languageCodeToId(language_code) if language_id == -1: - sys.stderr.write("unknown language code \"" + language_code + "\"\n") - return (None, None, None) + raise xpathlite.Error("unknown language code \"%s\"" % language_code) language = enumdata.language_list[language_id][0] if len(items) > 1: @@ -631,16 +638,14 @@ def _parseLocale(l): if len(script_code) == 4: script_id = enumdata.scriptCodeToId(script_code) if script_id == -1: - sys.stderr.write("unknown script code \"" + script_code + "\"\n") - return (None, None, None) + raise xpathlite.Error("unknown script code \"%s\"" % script_code) script = enumdata.script_list[script_id][0] else: country_code = script_code if country_code: country_id = enumdata.countryCodeToId(country_code) if country_id == -1: - sys.stderr.write("unknown country code \"" + country_code + "\"\n") - return (None, None, None) + raise xpathlite.Error("unknown country code \"%s\"" % country_code) country = enumdata.country_list[country_id][0] return (language, script, country) @@ -651,13 +656,15 @@ for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likel for data in ns[1:][0]: # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]] tmp[data[0]] = data[1] - (from_language, from_script, from_country) = _parseLocale(tmp[u"from"]) - if not from_language: - sys.stderr.write("skipping likelySubtag " + tmp[u"from"] + " -> " + tmp[u"to"] + "\n") + try: + (from_language, from_script, from_country) = _parseLocale(tmp[u"from"]) + except xpathlite.Error as e: + sys.stderr.write("skipping likelySubtag \"%s\" -> \"%s\" (%s)\n" % (tmp[u"from"], tmp[u"to"], str(e))) continue - (to_language, to_script, to_country) = _parseLocale(tmp[u"to"]) - if not to_language: - sys.stderr.write("skipping likelySubtag " + tmp[u"from"] + " -> " + tmp[u"to"] + "\n") + try: + (to_language, to_script, to_country) = _parseLocale(tmp[u"to"]) + except xpathlite.Error as e: + sys.stderr.write("skipping likelySubtag \"%s\" 
-> \"%s\" (%s)\n" % (tmp[u"from"], tmp[u"to"], str(e))) continue # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags if to_country == "AnyCountry" and from_country != to_country: diff --git a/util/local_database/qlocalexml2cpp.py b/util/local_database/qlocalexml2cpp.py index b82e1516b4..06fabcc77e 100755 --- a/util/local_database/qlocalexml2cpp.py +++ b/util/local_database/qlocalexml2cpp.py @@ -291,7 +291,7 @@ class Locale: self.currencyFormat = eltText(firstChildElt(elt, "currencyFormat")) self.currencyNegativeFormat = eltText(firstChildElt(elt, "currencyNegativeFormat")) -def loadLocaleMap(doc, language_map, script_map, country_map): +def loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map): result = {} locale_list_elt = firstChildElt(doc.documentElement, "localeList") @@ -307,6 +307,28 @@ def loadLocaleMap(doc, language_map, script_map, country_map): country_id = countryNameToId(locale.country, country_map) if country_id == -1: sys.stderr.write("Cannot find a country id for '%s'\n" % locale.country) + + if language_id != 1: # C + if country_id == 0: + sys.stderr.write("loadLocaleMap: No country id for '%s'\n" % locale.language) + + if script_id == 0: + # find default script for a given language and country (see http://www.unicode.org/reports/tr35/#Likely_Subtags) + for key in likely_subtags_map.keys(): + tmp = likely_subtags_map[key] + if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == locale.country: + locale.script = tmp["to"][1] + script_id = scriptNameToId(locale.script, script_map) + break + if script_id == 0 and country_id != 0: + # try with no country + for key in likely_subtags_map.keys(): + tmp = likely_subtags_map[key] + if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry": + locale.script = tmp["to"][1] + script_id = scriptNameToId(locale.script, script_map) + break + result[(language_id, script_id, 
country_id)] = locale locale_elt = nextSiblingElt(locale_elt, "locale") @@ -321,13 +343,21 @@ def compareLocaleKeys(key1, key2): l1 = compareLocaleKeys.locale_map[key1] l2 = compareLocaleKeys.locale_map[key2] - if l1.language in compareLocaleKeys.default_map: - default = compareLocaleKeys.default_map[l1.language] - if l1.country == default and key1[1] == 0: + if (l1.language, l1.script) in compareLocaleKeys.default_map.keys(): + default = compareLocaleKeys.default_map[(l1.language, l1.script)] + if l1.country == default: return -1 - if l2.country == default and key2[1] == 0: + if l2.country == default: return 1 + if key1[1] != key2[1]: + if (l2.language, l2.script) in compareLocaleKeys.default_map.keys(): + default = compareLocaleKeys.default_map[(l2.language, l2.script)] + if l2.country == default: + return 1 + if l1.country == default: + return -1 + if key1[1] != key2[1]: return key1[1] - key2[1] else: @@ -476,9 +506,9 @@ def main(): default_map = {} for key in likely_subtags_map.keys(): tmp = likely_subtags_map[key] - if tmp["from"][2] == "AnyCountry" and tmp["to"][2] != "AnyCountry" and tmp["from"][1] == "AnyScript": - default_map[tmp["to"][0]] = tmp["to"][2] - locale_map = loadLocaleMap(doc, language_map, script_map, country_map) + if tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry" and tmp["to"][2] != "AnyCountry": + default_map[(tmp["to"][0], tmp["to"][1])] = tmp["to"][2] + locale_map = loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map) dupes = findDupes(language_map, country_map) cldr_version = eltText(firstChildElt(doc.documentElement, "version")) @@ -495,6 +525,57 @@ def main(): */\n\n\n\ " % (str(datetime.date.today()), cldr_version) ) + # Likely subtags map + data_temp_file.write("static const QLocaleId likely_subtags[] = {\n") + index = 0 + for key in likely_subtags_map.keys(): + tmp = likely_subtags_map[key] + from_language = languageNameToId(tmp["from"][0], language_map) + from_script = 
scriptNameToId(tmp["from"][1], script_map) + from_country = countryNameToId(tmp["from"][2], country_map) + to_language = languageNameToId(tmp["to"][0], language_map) + to_script = scriptNameToId(tmp["to"][1], script_map) + to_country = countryNameToId(tmp["to"][2], country_map) + + cmnt_from = "" + if from_language != 0: + cmnt_from = cmnt_from + language_map[from_language][1] + else: + cmnt_from = cmnt_from + "und" + if from_script != 0: + if cmnt_from: + cmnt_from = cmnt_from + "_" + cmnt_from = cmnt_from + script_map[from_script][1] + if from_country != 0: + if cmnt_from: + cmnt_from = cmnt_from + "_" + cmnt_from = cmnt_from + country_map[from_country][1] + cmnt_to = "" + if to_language != 0: + cmnt_to = cmnt_to + language_map[to_language][1] + else: + cmnt_from = cmnt_from + "und" + if to_script != 0: + if cmnt_to: + cmnt_to = cmnt_to + "_" + cmnt_to = cmnt_to + script_map[to_script][1] + if to_country != 0: + if cmnt_to: + cmnt_to = cmnt_to + "_" + cmnt_to = cmnt_to + country_map[to_country][1] + + data_temp_file.write(" ") + data_temp_file.write("{ %3d, %2d, %3d }, { %3d, %2d, %3d }" % (from_language, from_script, from_country, to_language, to_script, to_country)) + index += 1 + if index != len(likely_subtags_map): + data_temp_file.write(",") + else: + data_temp_file.write(" ") + data_temp_file.write(" // %s -> %s\n" % (cmnt_from, cmnt_to)) + data_temp_file.write("};\n") + + data_temp_file.write("\n") + # Locale index data_temp_file.write("static const quint16 locale_index[] = {\n") index = 0 -- cgit v1.2.3