From 47d94dab0fbc428e5c3401c411ac01a05e5e851f Mon Sep 17 00:00:00 2001
From: Edward Welbourne <edward.welbourne@qt.io>
Date: Thu, 9 Jan 2020 20:36:58 +0100
Subject: Minor tidy-up in qlocalexml2cpp.py

Split a long line.
Use pythonic chained comparison to save some repetition.
Comment on a field not currently in actual use.
Say "zeros" rather than "0s" in one comment to match another.
Added a .h suffix to the main locale data tempfile to match the naming
of the tempfiles used for calendar data.

Simplify generation of the blank line between Language and Script; and
include a matching blank between Script and Country.
This adds one blank line to qlocale.h

Removed a stray space that misaligned locale data lines.
This produces a space-only change in the generated *_data_p.h files.

Change-Id: I974a9e8923c3dfd2178855d2cf1d6a5074e130b3
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
---
 util/locale_database/qlocalexml2cpp.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'util/locale_database/qlocalexml2cpp.py')

diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index 52e6331569..bf58683637 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -319,7 +319,7 @@ def escapedString(s):
     need_escape = False
     result = ""
     for c in s:
-        if ord(c) < 128 and (not need_escape or ord(c.lower()) < ord('a') or ord(c.lower()) > ord('f')):
+        if ord(c) < 128 and not (need_escape and ord('a') <= ord(c.lower()) <= ord('f')):
             line += c
             need_escape = False
         else:
@@ -361,7 +361,7 @@ def main():
                     for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))):
         usage()
 
-    (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p", dir=qtsrcdir)
+    (data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p.h", dir=qtsrcdir)
     data_temp_file = os.fdopen(data_temp_file, "w")
     qlocaledata_file = open(qtsrcdir + "/src/corelib/text/qlocale_data_p.h", "r")
     s = qlocaledata_file.readline()
@@ -426,7 +426,8 @@ def main():
             cmnt_to = cmnt_to + country_map[to_country][1]
 
         data_temp_file.write("    ")
-        data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" % (from_language, from_script, from_country, to_language, to_script, to_country))
+        data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" %
+                             (from_language, from_script, from_country, to_language, to_script, to_country))
         index += 1
         if index != len(likely_subtags_map):
             data_temp_file.write(",")
@@ -591,7 +592,7 @@ def main():
                         endonyms_data.append(l.languageEndonym),
                         endonyms_data.append(l.countryEndonym),
                         l.currencyDigits,
-                        l.currencyRounding,
+                        l.currencyRounding, # unused (QTBUG-81343)
                         l.firstDayOfWeek,
                         l.weekendStart,
                         l.weekendEnd)
@@ -600,7 +601,7 @@ def main():
                          % ( (0,) * (3 + 8 + 4) + ("0,0",) * (16 + 3)
                              + (currencyIsoCodeData(0),)
                              + ("0,0",) * 6 + (0,) * (2 + 3))
-                         + " // trailing 0s\n")
+                         + " // trailing zeros\n")
     data_temp_file.write("};\n")
 
     # StringData tables:
@@ -776,7 +777,7 @@ def main():
                    months_data.append(l.shortMonths[calendar]),
                    months_data.append(l.longMonths[calendar]),
                    months_data.append(l.narrowMonths[calendar]))
-                + "// %s/%s/%s\n " % (l.language, l.script, l.country))
+                + "// %s/%s/%s\n" % (l.language, l.script, l.country))
         calendar_temp_file.write(calendar_format % ( (0,) * 3 + ('0,0',) * 6 )
                                       + '// trailing zeros\n')
         calendar_temp_file.write("};\n")
@@ -815,9 +816,7 @@ def main():
                              ",\n")
     qlocaleh_temp_file.write("\n")
     qlocaleh_temp_file.write("        LastLanguage = " + language + "\n")
-    qlocaleh_temp_file.write("    };\n")
-
-    qlocaleh_temp_file.write("\n")
+    qlocaleh_temp_file.write("    };\n\n")
 
     # Script enum
     qlocaleh_temp_file.write("    enum Script {\n")
@@ -831,7 +830,7 @@ def main():
                              ",\n")
     qlocaleh_temp_file.write("\n")
     qlocaleh_temp_file.write("        LastScript = " + script + "\n")
-    qlocaleh_temp_file.write("    };\n")
+    qlocaleh_temp_file.write("    };\n\n")
 
     # Country enum
     qlocaleh_temp_file.write("    enum Country {\n")
-- 
cgit v1.2.3


From 4e84a8b29f13169a75c734920e953d3157768bca Mon Sep 17 00:00:00 2001
From: Edward Welbourne <edward.welbourne@qt.io>
Date: Thu, 9 Jan 2020 20:47:23 +0100
Subject: Deduplicate locale data tables

Some entries in tables were sub-strings (e.g. prefixes) of others.
Since we store start-index and length (with no need for terminators),
any entry that appears as a sub-string of an earlier entry can be
recorded without making a separate copy of its content, just by
recording where it appeared as a sub-string of an earlier entry.

(Sadly this doesn't apply to month- or day-names and their
short-forms: for those, we store ';'-joined lists.  Thus, although
each short-form is a prefix of its long-form, the short-form is stored
in a list with other short-forms; and this is not a prefix of the list
of matching long-forms.)

The savings are modest (780 bytes at present), but cost us nothing
except when running the python script that generates the data files
(it takes a little longer now), which usually only happens at a CLDR
update.

Change-Id: I05bdaa9283365707bac0190ae983b31f074dd6ed
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
---
 util/locale_database/qlocalexml2cpp.py | 58 +++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 22 deletions(-)

(limited to 'util/locale_database/qlocalexml2cpp.py')

diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index bf58683637..e5e5cccbff 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 #############################################################################
 ##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@@ -272,32 +272,46 @@ class StringData:
         self.data = []
         self.hash = {}
         self.name = name
+        self.text = '' # Used in quick-search for matches in data
 
     def append(self, s):
-        if s in self.hash:
-            return self.hash[s]
-
-        lst = unicode2hex(s)
-        index = len(self.data)
-        if index > 65535:
-            print "\n\n\n#error Data index is too big!"
-            sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
-            sys.exit(1)
-        size = len(lst)
-        if size >= 65535:
-            print "\n\n\n#error Data is too big!"
-            sys.stderr.write ("\n\n\nERROR: data size exceeds the uint16 range! size = %d\n" % size)
-            sys.exit(1)
-        token = None
         try:
-            token = StringDataToken(index, size)
-        except Error as e:
-            sys.stderr.write("\n\n\nERROR: %s: on data '%s'" % (e, s))
-            sys.exit(1)
-        self.hash[s] = token
-        self.data += lst
+            token = self.hash[s]
+        except KeyError:
+            token = self.__store(s)
+            self.hash[s] = token
         return token
 
+    def __store(self, s):
+        """Add string s to known data.
+
+        Seeks to avoid duplication, where possible.
+        For example, short-forms may be prefixes of long-forms.
+        """
+        if not s:
+            return StringDataToken(0, 0)
+        ucs2 = unicode2hex(s)
+        try:
+            index = self.text.index(s) - 1
+            matched = 0
+            while matched < len(ucs2):
+                index, matched = self.data.index(ucs2[0], index + 1), 1
+                if index + len(ucs2) >= len(self.data):
+                    raise ValueError # not found after all !
+                while matched < len(ucs2) and self.data[index + matched] == ucs2[matched]:
+                    matched += 1
+        except ValueError:
+            index = len(self.data)
+            self.data += ucs2
+            self.text += s
+
+        assert index >= 0
+        try:
+            return StringDataToken(index, len(ucs2))
+        except ValueError as e:
+            e.args += (self.name, s)
+            raise
+
     def write(self, fd):
         fd.write("\nstatic const ushort %s[] = {\n" % self.name)
         fd.write(wrap_list(self.data))
-- 
cgit v1.2.3


From c08a31634fd8d25d14aed4a73a80f44f254163f3 Mon Sep 17 00:00:00 2001
From: Edward Welbourne <edward.welbourne@qt.io>
Date: Thu, 9 Jan 2020 14:48:21 +0100
Subject: Separate offsets from sizes in QLocale's data

This enables us to make the sizes quint8 and benefit from the
resulting packing, making the locale data smaller. The sizes for long
month-name lists (which concatenate twelve names with semicolon as
separator) can overflow an 8-bit member, so use quint16 where needed.

Re-ordered the data in QLocaleData and QCalendarLocale. Now all
long-short(-narrow) families arise in that order; and any standalone
is grouped with the one of the same length. (This cost 20 bytes in the
date-format table, which optimises out more duplication if short is
before long, but the saving in the (smaller) time-format table more
than makes up for it; and 20 bytes isn't worth the confusion that being
inconsistent in ordering might cause.)

At the same time, drop trailing semicolons from list entries (which
join various names with semicolon) as they're not needed: we know
where the end of the list is, because we know the size of the string
that results from concatenation. The code that parses such lists can
even correctly handle empty entries at the end.

Saves 26 kB of data in the compiled binaries.

Task-number: QTBUG-81053
Change-Id: If6ccc96a6910828817aa605d10fd814f567ae1e8
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
---
 util/locale_database/qlocalexml2cpp.py | 210 ++++++++++++++++++---------------
 1 file changed, 116 insertions(+), 94 deletions(-)

(limited to 'util/locale_database/qlocalexml2cpp.py')

diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index e5e5cccbff..0cfa0f03e4 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -259,13 +259,16 @@ def unicode2hex(s):
     return lst
 
 class StringDataToken:
-    def __init__(self, index, length):
-        if index > 0xFFFF or length > 0xFFFF:
-            raise Error("Position exceeds ushort range: %d,%d " % (index, length))
+    def __init__(self, index, length, bits):
+        if index > 0xffff:
+            print "\n\n\n#error Data index is too big!", index
+            raise ValueError("Start-index (%d) exceeds the uint16 range!" % index)
+        if length >= (1 << bits):
+            print "\n\n\n#error Range length is too big!", length
+            raise ValueError("Data size (%d) exceeds the %d-bit range!" % (length, bits))
+
         self.index = index
         self.length = length
-    def __str__(self):
-        return " %d,%d " % (self.index, self.length)
 
 class StringData:
     def __init__(self, name):
@@ -274,22 +277,22 @@ class StringData:
         self.name = name
         self.text = '' # Used in quick-search for matches in data
 
-    def append(self, s):
+    def append(self, s, bits=8):
         try:
             token = self.hash[s]
         except KeyError:
-            token = self.__store(s)
+            token = self.__store(s, bits)
             self.hash[s] = token
         return token
 
-    def __store(self, s):
+    def __store(self, s, bits):
         """Add string s to known data.
 
         Seeks to avoid duplication, where possible.
         For example, short-forms may be prefixes of long-forms.
         """
         if not s:
-            return StringDataToken(0, 0)
+            return StringDataToken(0, 0, bits)
         ucs2 = unicode2hex(s)
         try:
             index = self.text.index(s) - 1
@@ -307,12 +310,15 @@ class StringData:
 
         assert index >= 0
         try:
-            return StringDataToken(index, len(ucs2))
+            return StringDataToken(index, len(ucs2), bits)
         except ValueError as e:
             e.args += (self.name, s)
             raise
 
     def write(self, fd):
+        if len(self.data) > 0xffff:
+            raise ValueError("Data is too big for quint16 index to its end!" % len(self.data),
+                             self.name)
         fd.write("\nstatic const ushort %s[] = {\n" % self.name)
         fd.write(wrap_list(self.data))
         fd.write("\n};\n")
@@ -498,39 +504,43 @@ def main():
                          + ' quotEnd '
                          + 'altQtOpn '
                          + 'altQtEnd '
-                         # Width 11 + comma:
-                         + '  lpStart   ' # List pattern
-                         + '   lpMid    '
-                         + '   lpEnd    '
-                         + '   lpTwo    '
-                         + '   sDtFmt   ' # Date format
-                         + '   lDtFmt   '
-                         + '   sTmFmt   ' # Time format
-                         + '   lTmFmt   '
-                         + '   ssDays   ' # Days
-                         + '   slDays   '
-                         + '   snDays   '
-                         + '    sDays   '
-                         + '    lDays   '
-                         + '    nDays   '
-                         + '     am     ' # am/pm indicators
-                         + '     pm     '
-                         # Width 8 + comma
-                         + '  byte   '
-                         + ' siQuant '
-                         + 'iecQuant '
+
+                         # Range entries (all start-indices, then all sizes):
+                         # Width 5 + comma:
+                         + 'lStrt ' # List pattern
+                         + 'lpMid '
+                         + 'lpEnd '
+                         + 'lPair '
+                         + 'lDFmt ' # Date format
+                         + 'sDFmt '
+                         + 'lTFmt ' # Time format
+                         + 'sTFmt '
+                         + 'slDay ' # Day names
+                         + 'lDays '
+                         + 'ssDys '
+                         + 'sDays '
+                         + 'snDay '
+                         + 'nDays '
+                         + '  am  ' # am/pm indicators
+                         + '  pm  '
+                         + ' byte '
+                         + 'siQnt '
+                         + 'iecQn '
+                         + 'crSym ' # Currency formatting:
+                         + 'crDsp '
+                         + 'crFmt '
+                         + 'crFNg '
+                         + 'ntLng ' # Name of language in itself, and of territory:
+                         + 'ntTer '
+                         # Width 3 + comma for each size; no header
+                         + '    ' * 25
+
+                         # Strays (char array, bit-fields):
                          # Width 8+4 + comma
                          + '   currISO   '
-                         # Width 11 + comma:
-                         + '  currSym   ' # Currency formatting:
-                         + ' currDsply  '
-                         + '  currFmt   '
-                         + ' currFmtNeg '
-                         + '  endoLang  ' # Name of language in itself, and of country:
-                         + '  endoCntry '
                          # Width 6 + comma:
-                         + 'curDgt ' # Currency number representation:
-                         + 'curRnd '
+                         + 'curDgt ' # Currency digits
+                         + 'curRnd ' # Currency rounding (unused: QTBUG-81343)
                          + 'dow1st ' # First day of week
                          + ' wknd+ ' # Week-end start/end days:
                          + ' wknd-'
@@ -550,14 +560,16 @@ def main():
                    + '%6d,' * 8
                    # Quotation marks:
                    + '%8d,' * 4
+
                    # List patterns, date/time formats, month/day names, am/pm:
-                   + '%11s,' * 16
                    # SI/IEC byte-unit abbreviations:
-                   + '%8s,' * 3
+                   # Currency and endonyms
+                   + '%5d,' * 25
+                   # Sizes for the same:
+                   + '%3d,' * 25
+
                    # Currency ISO code:
                    + ' %10s, '
-                   # Currency and endonyms
-                   + '%11s,' * 6
                    # Currency formatting:
                    + '%6d,%6d'
                    # Day of week and week-end:
@@ -565,8 +577,32 @@ def main():
                    + ' }')
     for key in locale_keys:
         l = locale_map[key]
+        # Sequence of StringDataToken:
+        ranges = (tuple(list_pattern_part_data.append(p) for p in # 4 entries:
+                        (l.listPatternPartStart, l.listPatternPartMiddle,
+                         l.listPatternPartEnd, l.listPatternPartTwo)) +
+                  tuple (date_format_data.append(f) for f in # 2 entries:
+                         (l.longDateFormat, l.shortDateFormat)) +
+                  tuple(time_format_data.append(f) for f in # 2 entries:
+                        (l.longTimeFormat, l.shortTimeFormat)) +
+                  tuple(days_data.append(d) for d in # 6 entries:
+                        (l.standaloneLongDays, l.longDays,
+                         l.standaloneShortDays, l.shortDays,
+                         l.standaloneNarrowDays, l.narrowDays)) +
+                  (am_data.append(l.am), pm_data.append(l.pm)) + # 2 entries:
+                  tuple(byte_unit_data.append(b) for b in # 3 entries:
+                        (l.byte_unit, l.byte_si_quantified, l.byte_iec_quantified)) +
+                  (currency_symbol_data.append(l.currencySymbol),
+                   currency_display_name_data.append(l.currencyDisplayName),
+                   currency_format_data.append(l.currencyFormat),
+                   currency_format_data.append(l.currencyNegativeFormat),
+                   endonyms_data.append(l.languageEndonym),
+                   endonyms_data.append(l.countryEndonym)) # 6 entries
+                  ) # Total: 25 entries
+        assert len(ranges) == 25
+
         data_temp_file.write(line_format
-                    % (key[0], key[1], key[2],
+                    % ((key[0], key[1], key[2],
                         l.decimal,
                         l.group,
                         l.listDelim,
@@ -578,43 +614,21 @@ def main():
                         l.quotationStart,
                         l.quotationEnd,
                         l.alternateQuotationStart,
-                        l.alternateQuotationEnd,
-                        list_pattern_part_data.append(l.listPatternPartStart),
-                        list_pattern_part_data.append(l.listPatternPartMiddle),
-                        list_pattern_part_data.append(l.listPatternPartEnd),
-                        list_pattern_part_data.append(l.listPatternPartTwo),
-                        date_format_data.append(l.shortDateFormat),
-                        date_format_data.append(l.longDateFormat),
-                        time_format_data.append(l.shortTimeFormat),
-                        time_format_data.append(l.longTimeFormat),
-                        days_data.append(l.standaloneShortDays),
-                        days_data.append(l.standaloneLongDays),
-                        days_data.append(l.standaloneNarrowDays),
-                        days_data.append(l.shortDays),
-                        days_data.append(l.longDays),
-                        days_data.append(l.narrowDays),
-                        am_data.append(l.am),
-                        pm_data.append(l.pm),
-                        byte_unit_data.append(l.byte_unit),
-                        byte_unit_data.append(l.byte_si_quantified),
-                        byte_unit_data.append(l.byte_iec_quantified),
-                        currencyIsoCodeData(l.currencyIsoCode),
-                        currency_symbol_data.append(l.currencySymbol),
-                        currency_display_name_data.append(l.currencyDisplayName),
-                        currency_format_data.append(l.currencyFormat),
-                        currency_format_data.append(l.currencyNegativeFormat),
-                        endonyms_data.append(l.languageEndonym),
-                        endonyms_data.append(l.countryEndonym),
+                        l.alternateQuotationEnd) +
+                       tuple(r.index for r in ranges) +
+                       tuple(r.length for r in ranges) +
+                       (currencyIsoCodeData(l.currencyIsoCode),
                         l.currencyDigits,
                         l.currencyRounding, # unused (QTBUG-81343)
                         l.firstDayOfWeek,
                         l.weekendStart,
-                        l.weekendEnd)
+                        l.weekendEnd))
                              + ", // %s/%s/%s\n" % (l.language, l.script, l.country))
     data_temp_file.write(line_format # All zeros, matching the format:
-                         % ( (0,) * (3 + 8 + 4) + ("0,0",) * (16 + 3)
+                         % ( (0,) * (3 + 8 + 4) + (0,) * 25 * 2
                              + (currencyIsoCodeData(0),)
-                             + ("0,0",) * 6 + (0,) * (2 + 3))
+                             + (0,) * 2
+                             + (0,) * 3)
                          + " // trailing zeros\n")
     data_temp_file.write("};\n")
 
@@ -750,7 +764,7 @@ def main():
     os.rename(data_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
 
     # Generate calendar data
-    calendar_format = '      {%6d,%6d,%6d,{%5s},{%5s},{%5s},{%5s},{%5s},{%5s}}, '
+    calendar_format = '      {%6d,%6d,%6d' + ',%5d' * 6 + ',%3d' * 6 + ' },'
     for calendar, stem in calendars.items():
         months_data = StringData('months_data')
         calendar_data_file = "q%scalendar_data_p.h" % stem
@@ -770,30 +784,38 @@ def main():
                                  + ' lang  '
                                  + ' script'
                                  + ' terr  '
-                                 # Month-name start-end pairs, width 8 (5 plus '{},'):
-                                     + ' sShort '
-                                 + ' sLong  '
-                                 + ' sNarrow'
-                                 + ' short  '
-                                 + ' long   '
-                                 + ' narrow'
-                                 # No trailing space on last; be sure
-                                 # to pad before adding later entries.
+                                 # Month-name start-indices, width 6 (5 + comma):
+                                 + 'sLng '
+                                 + 'long '
+                                 + 'sSrt '
+                                 + 'shrt '
+                                 + 'sNrw '
+                                 + 'naro '
+                                 # No individual headers for the sizes.
+                                 + 'Sizes...'
                                  + '\n')
         for key in locale_keys:
             l = locale_map[key]
+            # Sequence of StringDataToken:
+            try:
+                # Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
+                ranges = (tuple(months_data.append(m[calendar], 16) for m in
+                                (l.standaloneLongMonths, l.longMonths)) +
+                          tuple(months_data.append(m[calendar]) for m in
+                                (l.standaloneShortMonths, l.shortMonths,
+                                 l.standaloneNarrowMonths, l.narrowMonths)))
+            except ValueError as e:
+                e.args += (l.language, l.script, l.country, stem)
+                raise
+
             calendar_temp_file.write(
                 calendar_format
-                % (key[0], key[1], key[2],
-                   months_data.append(l.standaloneShortMonths[calendar]),
-                   months_data.append(l.standaloneLongMonths[calendar]),
-                   months_data.append(l.standaloneNarrowMonths[calendar]),
-                   months_data.append(l.shortMonths[calendar]),
-                   months_data.append(l.longMonths[calendar]),
-                   months_data.append(l.narrowMonths[calendar]))
+                % ((key[0], key[1], key[2]) +
+                   tuple(r.index for r in ranges) +
+                   tuple(r.length for r in ranges))
                 + "// %s/%s/%s\n" % (l.language, l.script, l.country))
-        calendar_temp_file.write(calendar_format % ( (0,) * 3 + ('0,0',) * 6 )
-                                      + '// trailing zeros\n')
+        calendar_temp_file.write(calendar_format % ( (0,) * (3 + 6 * 2) )
+                                 + '// trailing zeros\n')
         calendar_temp_file.write("};\n")
         months_data.write(calendar_temp_file)
         s = calendar_template_file.readline()
-- 
cgit v1.2.3