summary refs log tree commit diff stats
path: root/util/locale_database/qlocalexml2cpp.py
diff options
context:
space:
mode:
Diffstat (limited to 'util/locale_database/qlocalexml2cpp.py')
-rwxr-xr-x util/locale_database/qlocalexml2cpp.py 227
1 files changed, 204 insertions, 23 deletions
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index a884459ae3..5bc9dd92f2 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -22,6 +22,23 @@ from typing import Optional
from qlocalexml import QLocaleXmlReader
from localetools import *
from iso639_3 import LanguageCodeData
+from zonedata import utcIdList, windowsIdList
+
+
+# Sanity check the zone data:
+
+# Offsets of the windows tables, in minutes, where whole numbers
+# (those that divmod() by 60 splits with no remainder):
+winOff = {m for m, s in (divmod(v, 60) for _name, v in windowsIdList) if s == 0}
+# The UTC±HH:mm forms of the non-zero offsets:
+winUtc = {f'UTC-{h:02}:{m:02}'
+          for h, m in (divmod(-o, 60) for o in winOff if o < 0)
+          }.union(f'UTC+{h:02}:{m:02}'
+                  for h, m in (divmod(o, 60) for o in winOff if o > 0))
+# All such offsets should be represented by entries in utcIdList:
+newUtc = winUtc.difference(utcIdList)
+assert not newUtc, (
+    'Please add missing UTC-offset zones to zonedata.utcIdList', newUtc)
+
class LocaleKeySorter:
"""Sort-ordering representation of a locale key.
@@ -47,39 +64,64 @@ class LocaleKeySorter:
# TODO: should we compare territory before or after script ?
return (key[0], self.foreign(key)) + key[1:]
-class StringDataToken:
- def __init__(self, index, length, bits):
+class ByteArrayData:
+ # Only for use with ASCII data, e.g. IANA IDs.
+ #
+ # Collects strings into one flat list, self.data, and tells callers
+ # the index at which each starts. self.hash maps each string already
+ # stored (with its NUL terminator) to that index, so that a repeated
+ # string is stored only once.
+ def __init__(self):
+ self.data, self.hash = [], {}
+
+ def append(self, s):
+ # Returns the index in self.data at which s starts, storing s
+ # (followed by a NUL terminator) first if it is not already known.
+ # Raises Error if a new string would start beyond index 0xffff.
+ assert s.isascii(), s
+ s += '\0'
+ if s in self.hash:
+ return self.hash[s]
+
+ index = len(self.data)
if index > 0xffff:
- raise ValueError(f'Start-index ({index}) exceeds the uint16 range!')
- if length >= (1 << bits):
- raise ValueError(f'Data size ({length}) exceeds the {bits}-bit range!')
+ raise Error(f'Index ({index}) outside the uint16 range !')
+ self.hash[s] = index
+ # NOTE(review): presumably unicode2hex() yields one hex literal per
+ # character, making index a character offset - confirm in localetools.
+ self.data += unicode2hex(s)
+ return index
+
+ def write(self, out, name):
+ # Passes to out() the text of a C++ array definition, called name,
+ # whose initializer is built from self.data.
+ out(f'\nstatic constexpr char {name}[] = {{\n')
+ out(wrap_list(self.data, 16)) # 16 == 100 // len('0xhh, ')
+ # All data is ASCII, so only two-digit hex is ever needed.
+ out('\n};\n')
+
+class StringDataToken:
+ # Records a start index and a length, after checking that each fits
+ # in its allotted number of bits.
+ def __init__(self, index, length, lenbits, indbits):
+ # Raises ValueError if index needs more than indbits bits or
+ # length needs more than lenbits bits.
+ if index >= (1 << indbits):
+ raise ValueError(f'Start-index ({index}) exceeds the {indbits}-bit range!')
+ if length >= (1 << lenbits):
+ raise ValueError(f'Data size ({length}) exceeds the {lenbits}-bit range!')
self.index = index
self.length = length
class StringData:
- def __init__(self, name):
+ def __init__(self, name, lenbits = 8, indbits = 16):
self.data = []
self.hash = {}
self.name = name
self.text = '' # Used in quick-search for matches in data
+ self.__bits = lenbits, indbits
- def append(self, s, bits = 8):
+ def append(self, s):
try:
token = self.hash[s]
except KeyError:
- token = self.__store(s, bits)
+ token = self.__store(s)
self.hash[s] = token
return token
- def __store(self, s, bits):
+ def __store(self, s):
"""Add string s to known data.
Seeks to avoid duplication, where possible.
For example, short-forms may be prefixes of long-forms.
"""
if not s:
- return StringDataToken(0, 0, bits)
+ return StringDataToken(0, 0, *self.__bits)
ucs2 = unicode2hex(s)
try:
index = self.text.index(s) - 1
@@ -97,14 +139,16 @@ class StringData:
assert index >= 0
try:
- return StringDataToken(index, len(ucs2), bits)
+ return StringDataToken(index, len(ucs2), *self.__bits)
except ValueError as e:
e.args += (self.name, s)
raise
def write(self, fd):
- if len(self.data) > 0xffff:
- raise ValueError(f'Data is too big ({len(self.data)}) for quint16 index to its end!',
+ indbits = self.__bits[1]
+ if len(self.data) >= (1 << indbits):
+ raise ValueError(f'Data is too big ({len(self.data)}) '
+ f'for {indbits}-bit index to its end!',
self.name)
fd.write(f"\nstatic constexpr char16_t {self.name}[] = {{\n")
fd.write(wrap_list(self.data, 12)) # 12 == 100 // len('0xhhhh, ')
@@ -136,6 +180,92 @@ class LocaleSourceEditor (SourceFileEditor):
""")
+class TimeZoneDataWriter (LocaleSourceEditor):
+    """Writes the locale-independent time-zone tables as C++ source.
+
+    Each table-writing method emits one C++ array through
+    self.writer.write and, as it goes, stores the strings its rows
+    refer to in one of three ByteArrayData tables; the rows hold the
+    indices that ByteArrayData.append() hands back. Call writeTables()
+    after the other methods, so that the string data it emits is
+    complete."""
+    def __init__(self, path: Path, temp: Path, version: str):
+        """Forwards all three parameters to LocaleSourceEditor.
+
+        Also sets up the empty string tables and the list of
+        windowsIdList entries, sorted case-insensitively by name."""
+        super().__init__(path, temp, version)
+        self.__ianaTable = ByteArrayData() # Single IANA IDs
+        self.__ianaListTable = ByteArrayData() # Space-joined lists of IDs
+        self.__windowsTable = ByteArrayData() # Windows names for zones
+        self.__windowsList = sorted(windowsIdList,
+                                    key=lambda p: p[0].lower())
+        # Maps each Windows name to its (1-based key, offset) pair:
+        self.windowsKey = {name: (key, off) for key, (name, off)
+                           in enumerate(self.__windowsList, 1)}
+
+    def utcTable(self):
+        """Writes utcDataTable, one row per distinct offset in utcIdList.
+
+        Names sharing an offset are joined with spaces and stored as
+        one entry in the IANA list table; the row holds that entry's
+        index and the offset."""
+        offsetMap, out = {}, self.writer.write
+        for name in utcIdList:
+            offset = self.__offsetOf(name)
+            offsetMap[offset] = offsetMap.get(offset, ()) + (name,)
+
+        # Write UTC ID key table
+        out('// IANA ID Index, UTC Offset\n')
+        out('static constexpr UtcData utcDataTable[] = {\n')
+        # Sort so C++ can binary-chop; offsets are the (unique) keys.
+        for offset, names in sorted(offsetMap.items()):
+            joined = self.__ianaListTable.append(' '.join(names))
+            out(f' {{ {joined:6d},{offset:6d} }}, // {names[0]}\n')
+        out('};\n')
+
+    def aliasToIana(self, pairs):
+        """Writes aliasMappingTable from (alias, IANA ID) pairs.
+
+        The pairs are written in the order given, so must arrive
+        sorted. Both names of each pair are stored in the single-ID
+        table and the row holds their two indices."""
+        out, store = self.writer.write, self.__ianaTable.append
+
+        # NOTE(review): the second column is the IANA ID's index; the
+        # emitted heading is left as it was, since it is output text.
+        out('// Alias ID Index, Alias ID Index\n')
+        out('static constexpr AliasData aliasMappingTable[] = {\n')
+        for name, iana in pairs: # They're ready-sorted
+            assert name != iana, (name, iana) # Filtered out in QLocaleXmlWriter
+            out(f' {{ {store(name):6d},{store(iana):6d} }},'
+                f' // {name} -> {iana}\n')
+        out('};\n\n')
+
+    def msToIana(self, pairs):
+        """Writes windowsDataTable from (MS name, IANA ID) pairs.
+
+        There is one row per entry of the sorted Windows list, in that
+        order. Raises KeyError if pairs has no entry for one of the
+        Windows names."""
+        out, winStore = self.writer.write, self.__windowsTable.append
+        ianaStore = self.__ianaListTable.append # TODO: Should be __ianaTable
+        alias = dict(pairs) # {MS name: IANA ID}
+
+        out('// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n')
+        out('static constexpr WindowsData windowsDataTable[] = {\n')
+        # Sorted by Windows ID key:
+
+        for index, (name, offset) in enumerate(self.__windowsList, 1):
+            out(f' {{ {index:6d},{winStore(name):6d},'
+                f'{ianaStore(alias[name]):6d},{offset:6d} }}, // {name}\n')
+        out('};\n\n')
+
+    def msLandIanas(self, triples): # (MS name, territory code, IANA list)
+        """Writes zoneDataTable from (MS name, territory code, IANA list).
+
+        Each MS name is replaced by its key from self.windowsKey and
+        each territory code by the matching key of
+        enumdata.territory_map; a KeyError results if either is
+        unknown. The IANA list is stored in the IANA list table."""
+        out, store = self.writer.write, self.__ianaListTable.append
+        from enumdata import territory_map
+        # territory_map's values are (name, code) pairs; look up by code:
+        landKey = {code: (i, name) for i, (name, code) in territory_map.items()}
+        seq = sorted((self.windowsKey[name][0], landKey[land][0],
+                      name, landKey[land][1], ianas)
+                     for name, land, ianas in triples)
+
+        out('// Windows ID Key, Territory Enum, IANA ID Index\n')
+        out('static constexpr ZoneData zoneDataTable[] = {\n')
+        # Sorted by (Windows ID Key, territory enum)
+        for winId, landId, name, land, ianas in seq:
+            out(f' {{ {winId:6d},{landId:6d},{store(ianas):6d} }},'
+                f' // {name} / {land}\n')
+        out('};\n\n')
+
+    def writeTables(self):
+        """Writes the three string tables the other methods filled in."""
+        self.__windowsTable.write(self.writer.write, 'windowsIdData')
+        # TODO: these are misnamed, entries in the first are lists,
+        # those in the next are single IANA IDs
+        self.__ianaListTable.write(self.writer.write, 'ianaIdData')
+        self.__ianaTable.write(self.writer.write, 'aliasIdData')
+
+    # Implementation details:
+    @staticmethod
+    def __offsetOf(utcName):
+        "Maps a UTC±HH:mm name to its offset in seconds"
+        assert utcName.startswith('UTC')
+        if len(utcName) == 3:
+            # Plain 'UTC', with no offset suffix.
+            return 0
+        assert utcName[3] in '+-', utcName
+        sign = -1 if utcName[3] == '-' else 1
+        assert len(utcName) == 9 and utcName[6] == ':', utcName
+        hour, mins = int(utcName[4:6]), int(utcName[-2:])
+        return sign * (hour * 60 + mins) * 60
+
class LocaleDataWriter (LocaleSourceEditor):
def likelySubtags(self, likely):
# First sort likely, so that we can use binary search in C++
@@ -414,7 +544,7 @@ class CalendarDataWriter (LocaleSourceEditor):
+ ','.join(('{:6d}',) * 3 + ('{:5d}',) * 6 + ('{:3d}',) * 6)
+ ' }},').format
def write(self, calendar, locales, names):
- months_data = StringData('months_data')
+ months_data = StringData('months_data', 16)
self.writer.write('static constexpr QCalendarLocale locale_data[] = {\n')
self.writer.write(
@@ -438,11 +568,10 @@ class CalendarDataWriter (LocaleSourceEditor):
# Sequence of StringDataToken:
try:
# Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
- ranges = (tuple(months_data.append(m[calendar], 16) for m in
- (locale.standaloneLongMonths, locale.longMonths)) +
- tuple(months_data.append(m[calendar]) for m in
- (locale.standaloneShortMonths, locale.shortMonths,
- locale.standaloneNarrowMonths, locale.narrowMonths)))
+ ranges = tuple(months_data.append(m[calendar]) for m in
+ (locale.standaloneLongMonths, locale.longMonths,
+ locale.standaloneShortMonths, locale.shortMonths,
+ locale.standaloneNarrowMonths, locale.narrowMonths))
except ValueError as e:
e.args += (locale.language, locale.script, locale.territory)
raise
@@ -458,6 +587,23 @@ class CalendarDataWriter (LocaleSourceEditor):
self.writer.write('};\n')
months_data.write(self.writer)
+
+class TestLocaleWriter (LocaleSourceEditor):
+    """Writes the list of locales used by the testlocales program."""
+    def localeList(self, locales):
+        """Writes g_locale_list, one row per (language, territory) pair.
+
+        Takes (language, script, territory) triples; the script is
+        ignored and only the first triple with any given language and
+        territory produces a row. The trailing comment on each row
+        uses the first field of the matching enumdata map entries."""
+        emit = self.writer.write
+        emit('const LocaleListItem g_locale_list[] = {\n')
+        from enumdata import language_map, territory_map
+        # TODO: update testlocales/ to include script.
+        # Until then, list each (lang, land) pair only once, at its
+        # first appearance:
+        unseen = {(lang, land) for lang, _script, land in locales}
+        for lang, _script, land in locales:
+            key = (lang, land)
+            if key not in unseen:
+                continue
+            unseen.remove(key)
+            language = language_map[lang][0]
+            territory = territory_map[land][0]
+            emit(f' {{ {lang:6d},{land:6d} }}, // {language}/{territory}\n')
+        emit('};\n\n')
+
+
class LocaleHeaderWriter (SourceFileEditor):
def __init__(self, path, temp, enumify):
super().__init__(path, temp)
@@ -504,17 +650,29 @@ class LocaleHeaderWriter (SourceFileEditor):
out('\n };\n')
-def main(out, err):
+def main(argv, out, err):
+ """Updates QLocale's CLDR data from a QLocaleXML file.
+
+ Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as
+ arguments. In argv[1:] it expects the QLocaleXML file as first
+ parameter and the ISO 639-3 data table as second
+ parameter. Accepts the root of the qtbase checkout as third
+ parameter (default is inferred from this script's path) and a
+ --calendars option to select which calendars to support (all
+ available by default).
+
+ Updates various src/corelib/t*/q*_data_p.h files within the qtbase
+ checkout to contain data extracted from the QLocaleXML file."""
calendars_map = {
# CLDR name: Qt file name fragment
'gregorian': 'roman',
'persian': 'jalali',
'islamic': 'hijri',
- # 'hebrew': 'hebrew'
}
all_calendars = list(calendars_map.keys())
parser = argparse.ArgumentParser(
+ prog=Path(argv[0]).name,
description='Generate C++ code from CLDR data in QLocaleXML form.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('input_file', help='input XML file name',
@@ -526,7 +684,7 @@ def main(out, err):
parser.add_argument('--calendars', help='select calendars to emit data for',
nargs='+', metavar='CALENDAR',
choices=all_calendars, default=all_calendars)
- args = parser.parse_args()
+ args = parser.parse_args(argv[1:])
qlocalexml = args.input_file
qtsrcdir = Path(args.qtbase_path)
@@ -594,8 +752,31 @@ def main(out, err):
err.write(f'\nError updating qlocale.h: {e}\n')
return 1
+ # Locale-independent timezone data
+ try:
+ with TimeZoneDataWriter(qtsrcdir.joinpath(
+ 'src/corelib/time/qtimezoneprivate_data_p.h'),
+ qtsrcdir, reader.cldrVersion) as writer:
+ writer.aliasToIana(reader.aliasToIana())
+ writer.msLandIanas(reader.msLandIanas())
+ writer.msToIana(reader.msToIana())
+ writer.utcTable()
+ writer.writeTables()
+ except Exception as e:
+ err.write(f'\nError updating qtimezoneprivate_data_p.h: {e}\n')
+ return 1
+
+ # ./testlocales/localemodel.cpp
+ try:
+ path = 'util/locale_database/testlocales/localemodel.cpp'
+ with TestLocaleWriter(qtsrcdir.joinpath(path), qtsrcdir,
+ reader.cldrVersion) as test:
+ test.localeList(locale_keys)
+ except Exception as e:
+ err.write(f'\nError updating localemodel.cpp: {e}\n')
+
return 0
if __name__ == "__main__":
import sys
- sys.exit(main(sys.stdout, sys.stderr))
+ sys.exit(main(sys.argv, sys.stdout, sys.stderr))