diff options
Diffstat (limited to 'util/locale_database/qlocalexml2cpp.py')
-rwxr-xr-x | util/locale_database/qlocalexml2cpp.py | 227 |
1 files changed, 204 insertions, 23 deletions
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index a884459ae3..5bc9dd92f2 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -22,6 +22,23 @@ from typing import Optional from qlocalexml import QLocaleXmlReader from localetools import * from iso639_3 import LanguageCodeData +from zonedata import utcIdList, windowsIdList + + +# Sanity check the zone data: + +# Offsets of the windows tables, in minutes, where whole numbers: +winOff = set(m for m, s in (divmod(v, 60) for k, v in windowsIdList) if s == 0) +# The UTC±HH:mm forms of the non-zero offsets: +winUtc = set(f'UTC-{h:02}:{m:02}' + for h, m in (divmod(-o, 60) for o in winOff if o < 0) + ).union(f'UTC+{h:02}:{m:02}' + for h, m in (divmod(o, 60) for o in winOff if o > 0)) +# All such offsets should be represented by entries in utcIdList: +newUtc = winUtc.difference(utcIdList) +assert not newUtc, ( + 'Please add missing UTC-offset zones to to zonedata.utcIdList', newUtc) + class LocaleKeySorter: """Sort-ordering representation of a locale key. @@ -47,39 +64,64 @@ class LocaleKeySorter: # TODO: should we compare territory before or after script ? return (key[0], self.foreign(key)) + key[1:] -class StringDataToken: - def __init__(self, index, length, bits): +class ByteArrayData: + # Only for use with ASCII data, e.g. IANA IDs. + def __init__(self): + self.data, self.hash = [], {} + + def append(self, s): + assert s.isascii(), s + s += '\0' + if s in self.hash: + return self.hash[s] + + index = len(self.data) if index > 0xffff: - raise ValueError(f'Start-index ({index}) exceeds the uint16 range!') - if length >= (1 << bits): - raise ValueError(f'Data size ({length}) exceeds the {bits}-bit range!') + raise Error(f'Index ({index}) outside the uint16 range !') + self.hash[s] = index + self.data += unicode2hex(s) + return index + + def write(self, out, name): + out(f'\nstatic constexpr char {name}[] = {{\n') + out(wrap_list(self.data, 16)) # 16 == 100 // len('0xhh, ') + # All data is ASCII, so only two-digit hex is ever needed. + out('\n};\n') + +class StringDataToken: + def __init__(self, index, length, lenbits, indbits): + if index >= (1 << indbits): + raise ValueError(f'Start-index ({index}) exceeds the {indbits}-bit range!') + if length >= (1 << lenbits): + raise ValueError(f'Data size ({length}) exceeds the {lenbits}-bit range!') self.index = index self.length = length class StringData: - def __init__(self, name): + def __init__(self, name, lenbits = 8, indbits = 16): self.data = [] self.hash = {} self.name = name self.text = '' # Used in quick-search for matches in data + self.__bits = lenbits, indbits - def append(self, s, bits = 8): + def append(self, s): try: token = self.hash[s] except KeyError: - token = self.__store(s, bits) + token = self.__store(s) self.hash[s] = token return token - def __store(self, s, bits): + def __store(self, s): """Add string s to known data. Seeks to avoid duplication, where possible. For example, short-forms may be prefixes of long-forms. """ if not s: - return StringDataToken(0, 0, bits) + return StringDataToken(0, 0, *self.__bits) ucs2 = unicode2hex(s) try: index = self.text.index(s) - 1 @@ -97,14 +139,16 @@ class StringData: assert index >= 0 try: - return StringDataToken(index, len(ucs2), bits) + return StringDataToken(index, len(ucs2), *self.__bits) except ValueError as e: e.args += (self.name, s) raise def write(self, fd): - if len(self.data) > 0xffff: - raise ValueError(f'Data is too big ({len(self.data)}) for quint16 index to its end!', + indbits = self.__bits[1] + if len(self.data) >= (1 << indbits): + raise ValueError(f'Data is too big ({len(self.data)}) ' + f'for {indbits}-bit index to its end!', self.name) fd.write(f"\nstatic constexpr char16_t {self.name}[] = {{\n") fd.write(wrap_list(self.data, 12)) # 12 == 100 // len('0xhhhh, ') @@ -136,6 +180,92 @@ class LocaleSourceEditor (SourceFileEditor): """) +class TimeZoneDataWriter (LocaleSourceEditor): + def __init__(self, path: Path, temp: Path, version: str): + super().__init__(path, temp, version) + self.__ianaTable = ByteArrayData() # Single IANA IDs + self.__ianaListTable = ByteArrayData() # Space-joined lists of IDs + self.__windowsTable = ByteArrayData() # Windows names for zones + self.__windowsList = sorted(windowsIdList, + key=lambda p: p[0].lower()) + self.windowsKey = {name: (key, off) for key, (name, off) + in enumerate(self.__windowsList, 1)} + + def utcTable(self): + offsetMap, out = {}, self.writer.write + for name in utcIdList: + offset = self.__offsetOf(name) + offsetMap[offset] = offsetMap.get(offset, ()) + (name,) + + # Write UTC ID key table + out('// IANA ID Index, UTC Offset\n') + out('static constexpr UtcData utcDataTable[] = {\n') + for offset in sorted(offsetMap.keys()): # Sort so C++ can binary-chop. + names = offsetMap[offset]; + joined = self.__ianaListTable.append(' '.join(names)) + out(f' {{ {joined:6d},{offset:6d} }}, // {names[0]}\n') + out('};\n') + + def aliasToIana(self, pairs): + out, store = self.writer.write, self.__ianaTable.append + + out('// Alias ID Index, Alias ID Index\n') + out('static constexpr AliasData aliasMappingTable[] = {\n') + for name, iana in pairs: # They're ready-sorted + assert name != iana, (alias, iana) # Filtered out in QLocaleXmlWriter + out(f' {{ {store(name):6d},{store(iana):6d} }},' + f' // {name} -> {iana}\n') + out('};\n\n') + + def msToIana(self, pairs): + out, winStore = self.writer.write, self.__windowsTable.append + ianaStore = self.__ianaListTable.append # TODO: Should be __ianaTable + alias = dict(pairs) # {MS name: IANA ID} + + out('// Windows ID Key, Windows ID Index, IANA ID Index, UTC Offset\n') + out('static constexpr WindowsData windowsDataTable[] = {\n') + # Sorted by Windows ID key: + + for index, (name, offset) in enumerate(self.__windowsList, 1): + out(f' {{ {index:6d},{winStore(name):6d},' + f'{ianaStore(alias[name]):6d},{offset:6d} }}, // {name}\n') + out('};\n\n') + + def msLandIanas(self, triples): # (MS name, territory code, IANA list) + out, store = self.writer.write, self.__ianaListTable.append + from enumdata import territory_map + landKey = {code: (i, name) for i, (name, code) in territory_map.items()} + seq = sorted((self.windowsKey[name][0], landKey[land][0], name, landKey[land][1], ianas) + for name, land, ianas in triples) + + out('// Windows ID Key, Territory Enum, IANA ID Index\n') + out('static constexpr ZoneData zoneDataTable[] = {\n') + # Sorted by (Windows ID Key, territory enum) + for winId, landId, name, land, ianas in seq: + out(f' {{ {winId:6d},{landId:6d},{store(ianas):6d} }},' + f' // {name} / {land}\n') + out('};\n\n') + + def writeTables(self): + self.__windowsTable.write(self.writer.write, 'windowsIdData') + # TODO: these are misnamed, entries in the first are lists, + # those in the next are single IANA IDs + self.__ianaListTable.write(self.writer.write, 'ianaIdData') + self.__ianaTable.write(self.writer.write, 'aliasIdData') + + # Implementation details: + @staticmethod + def __offsetOf(utcName): + "Maps a UTC±HH:mm name to its offset in seconds" + assert utcName.startswith('UTC') + if len(utcName) == 3: + return 0 + assert utcName[3] in '+-', utcName + sign = -1 if utcName[3] == '-' else 1 + assert len(utcName) == 9 and utcName[6] == ':', utcName + hour, mins = int(utcName[4:6]), int(utcName[-2:]) + return sign * (hour * 60 + mins) * 60 + class LocaleDataWriter (LocaleSourceEditor): def likelySubtags(self, likely): # First sort likely, so that we can use binary search in C++ @@ -414,7 +544,7 @@ class CalendarDataWriter (LocaleSourceEditor): + ','.join(('{:6d}',) * 3 + ('{:5d}',) * 6 + ('{:3d}',) * 6) + ' }},').format def write(self, calendar, locales, names): - months_data = StringData('months_data') + months_data = StringData('months_data', 16) self.writer.write('static constexpr QCalendarLocale locale_data[] = {\n') self.writer.write( @@ -438,11 +568,10 @@ class CalendarDataWriter (LocaleSourceEditor): # Sequence of StringDataToken: try: # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264) - ranges = (tuple(months_data.append(m[calendar], 16) for m in - (locale.standaloneLongMonths, locale.longMonths)) + - tuple(months_data.append(m[calendar]) for m in - (locale.standaloneShortMonths, locale.shortMonths, - locale.standaloneNarrowMonths, locale.narrowMonths))) + ranges = tuple(months_data.append(m[calendar]) for m in + (locale.standaloneLongMonths, locale.longMonths, + locale.standaloneShortMonths, locale.shortMonths, + locale.standaloneNarrowMonths, locale.narrowMonths)) except ValueError as e: e.args += (locale.language, locale.script, locale.territory) raise @@ -458,6 +587,23 @@ class CalendarDataWriter (LocaleSourceEditor): self.writer.write('};\n') months_data.write(self.writer) + +class TestLocaleWriter (LocaleSourceEditor): + def localeList(self, locales): + self.writer.write('const LocaleListItem g_locale_list[] = {\n') + from enumdata import language_map, territory_map + # TODO: update testlocales/ to include script. + # For now, only mention each (lang, land) pair once: + pairs = set((lang, land) for lang, script, land in locales) + for lang, script, land in locales: + if (lang, land) in pairs: + pairs.discard((lang, land)) + langName = language_map[lang][0] + landName = territory_map[land][0] + self.writer.write(f' {{ {lang:6d},{land:6d} }}, // {langName}/{landName}\n') + self.writer.write('};\n\n') + + class LocaleHeaderWriter (SourceFileEditor): def __init__(self, path, temp, enumify): super().__init__(path, temp) @@ -504,17 +650,29 @@ class LocaleHeaderWriter (SourceFileEditor): out('\n };\n') -def main(out, err): +def main(argv, out, err): + """Updates QLocale's CLDR data from a QLocaleXML file. + + Takes sys.argv, sys.stdout, sys.stderr (or equivalents) as + arguments. In argv[1:] it expects the QLocaleXML file as first + parameter and the ISO 639-3 data table as second + parameter. Accepts the root of the qtbase checkout as third + parameter (default is inferred from this script's path) and a + --calendars option to select which calendars to support (all + available by default). + + Updates various src/corelib/t*/q*_data_p.h files within the qtbase + checkout to contain data extracted from the QLocaleXML file.""" calendars_map = { # CLDR name: Qt file name fragment 'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri', - # 'hebrew': 'hebrew' } all_calendars = list(calendars_map.keys()) parser = argparse.ArgumentParser( + prog=Path(argv[0]).name, description='Generate C++ code from CLDR data in QLocaleXML form.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('input_file', help='input XML file name', @@ -526,7 +684,7 @@ def main(out, err): parser.add_argument('--calendars', help='select calendars to emit data for', nargs='+', metavar='CALENDAR', choices=all_calendars, default=all_calendars) - args = parser.parse_args() + args = parser.parse_args(argv[1:]) qlocalexml = args.input_file qtsrcdir = Path(args.qtbase_path) @@ -594,8 +752,31 @@ def main(out, err): err.write(f'\nError updating qlocale.h: {e}\n') return 1 + # Locale-independent timezone data + try: + with TimeZoneDataWriter(qtsrcdir.joinpath( + 'src/corelib/time/qtimezoneprivate_data_p.h'), + qtsrcdir, reader.cldrVersion) as writer: + writer.aliasToIana(reader.aliasToIana()) + writer.msLandIanas(reader.msLandIanas()) + writer.msToIana(reader.msToIana()) + writer.utcTable() + writer.writeTables() + except Exception as e: + err.write(f'\nError updating qtimezoneprivate_data_p.h: {e}\n') + return 1 + + # ./testlocales/localemodel.cpp + try: + path = 'util/locale_database/testlocales/localemodel.cpp' + with TestLocaleWriter(qtsrcdir.joinpath(path), qtsrcdir, + reader.cldrVersion) as test: + test.localeList(locale_keys) + except Exception as e: + err.write(f'\nError updating localemodel.cpp: {e}\n') + return 0 if __name__ == "__main__": import sys - sys.exit(main(sys.stdout, sys.stderr)) + sys.exit(main(sys.argv, sys.stdout, sys.stderr)) |