From 47d94dab0fbc428e5c3401c411ac01a05e5e851f Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Thu, 9 Jan 2020 20:36:58 +0100 Subject: Minor tidy-up in qlocalexml2cpp.py Split a long line. Use pythonic chained comparison to save some repetition. Comment on a field not currently in actual use. Say "zeros" rather than "0s" in one comment to match another. Added a .h suffix to the main locale data tempfile to match the naming of the tempfiles used for calendar data. Simplify generation of the blank line between Language and Script; and include a matching blank between Script and Country. This adds one blank line to qlocale.h Removed a stray space that misaligned locale data lines. This produces a space-only change in the generated *_data_p.h files. Change-Id: I974a9e8923c3dfd2178855d2cf1d6a5074e130b3 Reviewed-by: Lars Knoll --- util/locale_database/qlocalexml2cpp.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'util/locale_database/qlocalexml2cpp.py') diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index 52e6331569..bf58683637 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -319,7 +319,7 @@ def escapedString(s): need_escape = False result = "" for c in s: - if ord(c) < 128 and (not need_escape or ord(c.lower()) < ord('a') or ord(c.lower()) > ord('f')): + if ord(c) < 128 and not (need_escape and ord('a') <= ord(c.lower()) <= ord('f')): line += c need_escape = False else: @@ -361,7 +361,7 @@ def main(): for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))): usage() - (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p", dir=qtsrcdir) + (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p.h", dir=qtsrcdir) data_temp_file = os.fdopen(data_temp_file, "w") qlocaledata_file = open(qtsrcdir + "/src/corelib/text/qlocale_data_p.h", "r") s = qlocaledata_file.readline() @@ -426,7 +426,8 @@ 
def main(): cmnt_to = cmnt_to + country_map[to_country][1] data_temp_file.write(" ") - data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" % (from_language, from_script, from_country, to_language, to_script, to_country)) + data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" % + (from_language, from_script, from_country, to_language, to_script, to_country)) index += 1 if index != len(likely_subtags_map): data_temp_file.write(",") @@ -591,7 +592,7 @@ def main(): endonyms_data.append(l.languageEndonym), endonyms_data.append(l.countryEndonym), l.currencyDigits, - l.currencyRounding, + l.currencyRounding, # unused (QTBUG-81343) l.firstDayOfWeek, l.weekendStart, l.weekendEnd) @@ -600,7 +601,7 @@ def main(): % ( (0,) * (3 + 8 + 4) + ("0,0",) * (16 + 3) + (currencyIsoCodeData(0),) + ("0,0",) * 6 + (0,) * (2 + 3)) - + " // trailing 0s\n") + + " // trailing zeros\n") data_temp_file.write("};\n") # StringData tables: @@ -776,7 +777,7 @@ def main(): months_data.append(l.shortMonths[calendar]), months_data.append(l.longMonths[calendar]), months_data.append(l.narrowMonths[calendar])) - + "// %s/%s/%s\n " % (l.language, l.script, l.country)) + + "// %s/%s/%s\n" % (l.language, l.script, l.country)) calendar_temp_file.write(calendar_format % ( (0,) * 3 + ('0,0',) * 6 ) + '// trailing zeros\n') calendar_temp_file.write("};\n") @@ -815,9 +816,7 @@ def main(): ",\n") qlocaleh_temp_file.write("\n") qlocaleh_temp_file.write(" LastLanguage = " + language + "\n") - qlocaleh_temp_file.write(" };\n") - - qlocaleh_temp_file.write("\n") + qlocaleh_temp_file.write(" };\n\n") # Script enum qlocaleh_temp_file.write(" enum Script {\n") @@ -831,7 +830,7 @@ def main(): ",\n") qlocaleh_temp_file.write("\n") qlocaleh_temp_file.write(" LastScript = " + script + "\n") - qlocaleh_temp_file.write(" };\n") + qlocaleh_temp_file.write(" };\n\n") # Country enum qlocaleh_temp_file.write(" enum Country {\n") -- cgit v1.2.3 From 4e84a8b29f13169a75c734920e953d3157768bca Mon Sep 17 00:00:00 2001 
From: Edward Welbourne Date: Thu, 9 Jan 2020 20:47:23 +0100 Subject: Deduplicate locale data tables Some entries in tables were sub-strings (e.g. prefixes) of others. Since we store start-index and length (with no need for terminators), any entry that appears as a sub-string of an earlier entry can be recorded without making a separate copy of its content, just by recording where it appeared as a sub-string of an earlier entry. (Sadly this doesn't apply to month- or day-names and their short-forms: for those, we store ';'-joined lists. Thus, although each short-form is a prefix of its long-form, the short-form is stored in a list with other short-forms; and this is not a prefix of the list of matching long-forms.) The savings are modest (780 bytes at present), but cost us nothing except when running the python script that generates the data files (it takes a little longer now), which usually only happens at a CLDR update. Change-Id: I05bdaa9283365707bac0190ae983b31f074dd6ed Reviewed-by: Lars Knoll --- util/locale_database/qlocalexml2cpp.py | 58 +++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 22 deletions(-) (limited to 'util/locale_database/qlocalexml2cpp.py') diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index bf58683637..e5e5cccbff 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -1,7 +1,7 @@ #!/usr/bin/env python2 ############################################################################# ## -## Copyright (C) 2018 The Qt Company Ltd. +## Copyright (C) 2020 The Qt Company Ltd. ## Contact: https://www.qt.io/licensing/ ## ## This file is part of the test suite of the Qt Toolkit. 
@@ -272,32 +272,46 @@ class StringData: self.data = [] self.hash = {} self.name = name + self.text = '' # Used in quick-search for matches in data def append(self, s): - if s in self.hash: - return self.hash[s] - - lst = unicode2hex(s) - index = len(self.data) - if index > 65535: - print "\n\n\n#error Data index is too big!" - sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index) - sys.exit(1) - size = len(lst) - if size >= 65535: - print "\n\n\n#error Data is too big!" - sys.stderr.write ("\n\n\nERROR: data size exceeds the uint16 range! size = %d\n" % size) - sys.exit(1) - token = None try: - token = StringDataToken(index, size) - except Error as e: - sys.stderr.write("\n\n\nERROR: %s: on data '%s'" % (e, s)) - sys.exit(1) - self.hash[s] = token - self.data += lst + token = self.hash[s] + except KeyError: + token = self.__store(s) + self.hash[s] = token return token + def __store(self, s): + """Add string s to known data. + + Seeks to avoid duplication, where possible. + For example, short-forms may be prefixes of long-forms. + """ + if not s: + return StringDataToken(0, 0) + ucs2 = unicode2hex(s) + try: + index = self.text.index(s) - 1 + matched = 0 + while matched < len(ucs2): + index, matched = self.data.index(ucs2[0], index + 1), 1 + if index + len(ucs2) >= len(self.data): + raise ValueError # not found after all ! 
+ while matched < len(ucs2) and self.data[index + matched] == ucs2[matched]: + matched += 1 + except ValueError: + index = len(self.data) + self.data += ucs2 + self.text += s + + assert index >= 0 + try: + return StringDataToken(index, len(ucs2)) + except ValueError as e: + e.args += (self.name, s) + raise + def write(self, fd): fd.write("\nstatic const ushort %s[] = {\n" % self.name) fd.write(wrap_list(self.data)) -- cgit v1.2.3 From c08a31634fd8d25d14aed4a73a80f44f254163f3 Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Thu, 9 Jan 2020 14:48:21 +0100 Subject: Separate offsets from sizes in QLocale's data This enables us to make the sizes quint8 and benefit from the resulting packing, making the locale data smaller. The sizes for long month-name lists (which concatenate twelve names with semicolon as separator) can overflow an 8-bit member, so use quint16 where needed. Re-ordered the data in QLocaleData and QCalendarLocale. Now all long-short(-narrow) families arise in that order; and any standalone is grouped with the one of the same length. (This cost 20 bytes in the date-format table, which optimises out more duplication if short is before long, but the saving in the (smaller) time-format table more than makes up for it; and 20 bytes isn't worth the confusion that being inconsistent in ordering might cause.) At the same time, drop trailing semicolons from list entries (which join various names with semicolon) as they're not needed: we know where the end of the list is, because we know the size of the string that results from concatenation. The code that parses such lists can even correctly handle empty entries at the end. Saves 26 kB of data in the compiled binaries. 
Task-number: QTBUG-81053 Change-Id: If6ccc96a6910828817aa605d10fd814f567ae1e8 Reviewed-by: Thiago Macieira Reviewed-by: Lars Knoll --- util/locale_database/qlocalexml2cpp.py | 210 ++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 94 deletions(-) (limited to 'util/locale_database/qlocalexml2cpp.py') diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index e5e5cccbff..0cfa0f03e4 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -259,13 +259,16 @@ def unicode2hex(s): return lst class StringDataToken: - def __init__(self, index, length): - if index > 0xFFFF or length > 0xFFFF: - raise Error("Position exceeds ushort range: %d,%d " % (index, length)) + def __init__(self, index, length, bits): + if index > 0xffff: + print "\n\n\n#error Data index is too big!", index + raise ValueError("Start-index (%d) exceeds the uint16 range!" % index) + if length >= (1 << bits): + print "\n\n\n#error Range length is too big!", length + raise ValueError("Data size (%d) exceeds the %d-bit range!" % (length, bits)) + self.index = index self.length = length - def __str__(self): - return " %d,%d " % (self.index, self.length) class StringData: def __init__(self, name): @@ -274,22 +277,22 @@ class StringData: self.name = name self.text = '' # Used in quick-search for matches in data - def append(self, s): + def append(self, s, bits=8): try: token = self.hash[s] except KeyError: - token = self.__store(s) + token = self.__store(s, bits) self.hash[s] = token return token - def __store(self, s): + def __store(self, s, bits): """Add string s to known data. Seeks to avoid duplication, where possible. For example, short-forms may be prefixes of long-forms. 
""" if not s: - return StringDataToken(0, 0) + return StringDataToken(0, 0, bits) ucs2 = unicode2hex(s) try: index = self.text.index(s) - 1 @@ -307,12 +310,15 @@ class StringData: assert index >= 0 try: - return StringDataToken(index, len(ucs2)) + return StringDataToken(index, len(ucs2), bits) except ValueError as e: e.args += (self.name, s) raise def write(self, fd): + if len(self.data) > 0xffff: + raise ValueError("Data is too big for quint16 index to its end!" % len(self.data), + self.name) fd.write("\nstatic const ushort %s[] = {\n" % self.name) fd.write(wrap_list(self.data)) fd.write("\n};\n") @@ -498,39 +504,43 @@ def main(): + ' quotEnd ' + 'altQtOpn ' + 'altQtEnd ' - # Width 11 + comma: - + ' lpStart ' # List pattern - + ' lpMid ' - + ' lpEnd ' - + ' lpTwo ' - + ' sDtFmt ' # Date format - + ' lDtFmt ' - + ' sTmFmt ' # Time format - + ' lTmFmt ' - + ' ssDays ' # Days - + ' slDays ' - + ' snDays ' - + ' sDays ' - + ' lDays ' - + ' nDays ' - + ' am ' # am/pm indicators - + ' pm ' - # Width 8 + comma - + ' byte ' - + ' siQuant ' - + 'iecQuant ' + + # Range entries (all start-indices, then all sizes): + # Width 5 + comma: + + 'lStrt ' # List pattern + + 'lpMid ' + + 'lpEnd ' + + 'lPair ' + + 'lDFmt ' # Date format + + 'sDFmt ' + + 'lTFmt ' # Time format + + 'sTFmt ' + + 'slDay ' # Day names + + 'lDays ' + + 'ssDys ' + + 'sDays ' + + 'snDay ' + + 'nDays ' + + ' am ' # am/pm indicators + + ' pm ' + + ' byte ' + + 'siQnt ' + + 'iecQn ' + + 'crSym ' # Currency formatting: + + 'crDsp ' + + 'crFmt ' + + 'crFNg ' + + 'ntLng ' # Name of language in itself, and of territory: + + 'ntTer ' + # Width 3 + comma for each size; no header + + ' ' * 25 + + # Strays (char array, bit-fields): # Width 8+4 + comma + ' currISO ' - # Width 11 + comma: - + ' currSym ' # Currency formatting: - + ' currDsply ' - + ' currFmt ' - + ' currFmtNeg ' - + ' endoLang ' # Name of language in itself, and of country: - + ' endoCntry ' # Width 6 + comma: - + 'curDgt ' # Currency number 
representation: - + 'curRnd ' + + 'curDgt ' # Currency digits + + 'curRnd ' # Currency rounding (unused: QTBUG-81343) + 'dow1st ' # First day of week + ' wknd+ ' # Week-end start/end days: + ' wknd-' @@ -550,14 +560,16 @@ def main(): + '%6d,' * 8 # Quotation marks: + '%8d,' * 4 + # List patterns, date/time formats, month/day names, am/pm: - + '%11s,' * 16 # SI/IEC byte-unit abbreviations: - + '%8s,' * 3 + # Currency and endonyms + + '%5d,' * 25 + # Sizes for the same: + + '%3d,' * 25 + # Currency ISO code: + ' %10s, ' - # Currency and endonyms - + '%11s,' * 6 # Currency formatting: + '%6d,%6d' # Day of week and week-end: @@ -565,8 +577,32 @@ def main(): + ' }') for key in locale_keys: l = locale_map[key] + # Sequence of StringDataToken: + ranges = (tuple(list_pattern_part_data.append(p) for p in # 4 entries: + (l.listPatternPartStart, l.listPatternPartMiddle, + l.listPatternPartEnd, l.listPatternPartTwo)) + + tuple (date_format_data.append(f) for f in # 2 entries: + (l.longDateFormat, l.shortDateFormat)) + + tuple(time_format_data.append(f) for f in # 2 entries: + (l.longTimeFormat, l.shortTimeFormat)) + + tuple(days_data.append(d) for d in # 6 entries: + (l.standaloneLongDays, l.longDays, + l.standaloneShortDays, l.shortDays, + l.standaloneNarrowDays, l.narrowDays)) + + (am_data.append(l.am), pm_data.append(l.pm)) + # 2 entries: + tuple(byte_unit_data.append(b) for b in # 3 entries: + (l.byte_unit, l.byte_si_quantified, l.byte_iec_quantified)) + + (currency_symbol_data.append(l.currencySymbol), + currency_display_name_data.append(l.currencyDisplayName), + currency_format_data.append(l.currencyFormat), + currency_format_data.append(l.currencyNegativeFormat), + endonyms_data.append(l.languageEndonym), + endonyms_data.append(l.countryEndonym)) # 6 entries + ) # Total: 25 entries + assert len(ranges) == 25 + data_temp_file.write(line_format - % (key[0], key[1], key[2], + % ((key[0], key[1], key[2], l.decimal, l.group, l.listDelim, @@ -578,43 +614,21 @@ def main(): 
l.quotationStart, l.quotationEnd, l.alternateQuotationStart, - l.alternateQuotationEnd, - list_pattern_part_data.append(l.listPatternPartStart), - list_pattern_part_data.append(l.listPatternPartMiddle), - list_pattern_part_data.append(l.listPatternPartEnd), - list_pattern_part_data.append(l.listPatternPartTwo), - date_format_data.append(l.shortDateFormat), - date_format_data.append(l.longDateFormat), - time_format_data.append(l.shortTimeFormat), - time_format_data.append(l.longTimeFormat), - days_data.append(l.standaloneShortDays), - days_data.append(l.standaloneLongDays), - days_data.append(l.standaloneNarrowDays), - days_data.append(l.shortDays), - days_data.append(l.longDays), - days_data.append(l.narrowDays), - am_data.append(l.am), - pm_data.append(l.pm), - byte_unit_data.append(l.byte_unit), - byte_unit_data.append(l.byte_si_quantified), - byte_unit_data.append(l.byte_iec_quantified), - currencyIsoCodeData(l.currencyIsoCode), - currency_symbol_data.append(l.currencySymbol), - currency_display_name_data.append(l.currencyDisplayName), - currency_format_data.append(l.currencyFormat), - currency_format_data.append(l.currencyNegativeFormat), - endonyms_data.append(l.languageEndonym), - endonyms_data.append(l.countryEndonym), + l.alternateQuotationEnd) + + tuple(r.index for r in ranges) + + tuple(r.length for r in ranges) + + (currencyIsoCodeData(l.currencyIsoCode), l.currencyDigits, l.currencyRounding, # unused (QTBUG-81343) l.firstDayOfWeek, l.weekendStart, - l.weekendEnd) + l.weekendEnd)) + ", // %s/%s/%s\n" % (l.language, l.script, l.country)) data_temp_file.write(line_format # All zeros, matching the format: - % ( (0,) * (3 + 8 + 4) + ("0,0",) * (16 + 3) + % ( (0,) * (3 + 8 + 4) + (0,) * 25 * 2 + (currencyIsoCodeData(0),) - + ("0,0",) * 6 + (0,) * (2 + 3)) + + (0,) * 2 + + (0,) * 3) + " // trailing zeros\n") data_temp_file.write("};\n") @@ -750,7 +764,7 @@ def main(): os.rename(data_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale_data_p.h") # Generate 
calendar data - calendar_format = ' {%6d,%6d,%6d,{%5s},{%5s},{%5s},{%5s},{%5s},{%5s}}, ' + calendar_format = ' {%6d,%6d,%6d' + ',%5d' * 6 + ',%3d' * 6 + ' },' for calendar, stem in calendars.items(): months_data = StringData('months_data') calendar_data_file = "q%scalendar_data_p.h" % stem @@ -770,30 +784,38 @@ def main(): + ' lang ' + ' script' + ' terr ' - # Month-name start-end pairs, width 8 (5 plus '{},'): - + ' sShort ' - + ' sLong ' - + ' sNarrow' - + ' short ' - + ' long ' - + ' narrow' - # No trailing space on last; be sure - # to pad before adding later entries. + # Month-name start-indices, width 6 (5 + comma): + + 'sLng ' + + 'long ' + + 'sSrt ' + + 'shrt ' + + 'sNrw ' + + 'naro ' + # No individual headers for the sizes. + + 'Sizes...' + '\n') for key in locale_keys: l = locale_map[key] + # Sequence of StringDataToken: + try: + # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264) + ranges = (tuple(months_data.append(m[calendar], 16) for m in + (l.standaloneLongMonths, l.longMonths)) + + tuple(months_data.append(m[calendar]) for m in + (l.standaloneShortMonths, l.shortMonths, + l.standaloneNarrowMonths, l.narrowMonths))) + except ValueError as e: + e.args += (l.language, l.script, l.country, stem) + raise + calendar_temp_file.write( calendar_format - % (key[0], key[1], key[2], - months_data.append(l.standaloneShortMonths[calendar]), - months_data.append(l.standaloneLongMonths[calendar]), - months_data.append(l.standaloneNarrowMonths[calendar]), - months_data.append(l.shortMonths[calendar]), - months_data.append(l.longMonths[calendar]), - months_data.append(l.narrowMonths[calendar])) + % ((key[0], key[1], key[2]) + + tuple(r.index for r in ranges) + + tuple(r.length for r in ranges)) + "// %s/%s/%s\n" % (l.language, l.script, l.country)) - calendar_temp_file.write(calendar_format % ( (0,) * 3 + ('0,0',) * 6 ) - + '// trailing zeros\n') + calendar_temp_file.write(calendar_format % ( (0,) * (3 + 6 * 2) ) + + '// trailing zeros\n') 
calendar_temp_file.write("};\n") months_data.write(calendar_temp_file) s = calendar_template_file.readline() -- cgit v1.2.3