diff options
author | Edward Welbourne <edward.welbourne@qt.io> | 2020-01-09 20:47:23 +0100 |
---|---|---|
committer | Edward Welbourne <edward.welbourne@qt.io> | 2020-01-30 17:58:15 +0100 |
commit | 4e84a8b29f13169a75c734920e953d3157768bca (patch) | |
tree | d8d67c2a2f10ec11a1ecdf18898727a627611622 /util/locale_database/qlocalexml2cpp.py | |
parent | 264ed73052513015caafaf146286cf63aaa68b03 (diff) |
Deduplicate locale data tables

Some entries in tables were sub-strings (e.g. prefixes) of others.
Since we store start-index and length (with no need for terminators),
any entry that appears as a sub-string of an earlier entry can be
recorded without making a separate copy of its content, just by
recording where it appeared as a sub-string of an earlier entry.

(Sadly this doesn't apply to month- or day-names and their
short-forms: for those, we store ';'-joined lists. Thus, although
each short-form is a prefix of its long-form, the short-form is stored
in a list with other short-forms; and this is not a prefix of the list
of matching long-forms.)

The savings are modest (780 bytes at present), but cost us nothing
except when running the python script that generates the data files
(it takes a little longer now), which usually only happens at a CLDR
update.

Change-Id: I05bdaa9283365707bac0190ae983b31f074dd6ed
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Diffstat (limited to 'util/locale_database/qlocalexml2cpp.py')
-rwxr-xr-x | util/locale_database/qlocalexml2cpp.py | 58 |
1 file changed, 36 insertions, 22 deletions
```diff
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index bf58683637..e5e5cccbff 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 #############################################################################
 ##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@@ -272,32 +272,46 @@ class StringData:
         self.data = []
         self.hash = {}
         self.name = name
+        self.text = '' # Used in quick-search for matches in data
 
     def append(self, s):
-        if s in self.hash:
-            return self.hash[s]
-
-        lst = unicode2hex(s)
-        index = len(self.data)
-        if index > 65535:
-            print "\n\n\n#error Data index is too big!"
-            sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
-            sys.exit(1)
-        size = len(lst)
-        if size >= 65535:
-            print "\n\n\n#error Data is too big!"
-            sys.stderr.write ("\n\n\nERROR: data size exceeds the uint16 range! size = %d\n" % size)
-            sys.exit(1)
-        token = None
         try:
-            token = StringDataToken(index, size)
-        except Error as e:
-            sys.stderr.write("\n\n\nERROR: %s: on data '%s'" % (e, s))
-            sys.exit(1)
-        self.hash[s] = token
-        self.data += lst
+            token = self.hash[s]
+        except KeyError:
+            token = self.__store(s)
+            self.hash[s] = token
         return token
 
+    def __store(self, s):
+        """Add string s to known data.
+
+        Seeks to avoid duplication, where possible.
+        For example, short-forms may be prefixes of long-forms.
+        """
+        if not s:
+            return StringDataToken(0, 0)
+        ucs2 = unicode2hex(s)
+        try:
+            index = self.text.index(s) - 1
+            matched = 0
+            while matched < len(ucs2):
+                index, matched = self.data.index(ucs2[0], index + 1), 1
+                if index + len(ucs2) >= len(self.data):
+                    raise ValueError # not found after all !
+                while matched < len(ucs2) and self.data[index + matched] == ucs2[matched]:
+                    matched += 1
+        except ValueError:
+            index = len(self.data)
+            self.data += ucs2
+            self.text += s
+
+        assert index >= 0
+        try:
+            return StringDataToken(index, len(ucs2))
+        except ValueError as e:
+            e.args += (self.name, s)
+            raise
+
     def write(self, fd):
         fd.write("\nstatic const ushort %s[] = {\n" % self.name)
         fd.write(wrap_list(self.data))
```