summary refs log tree commit diff stats
path: root/util/locale_database
diff options
context:
space:
mode:
author Edward Welbourne <edward.welbourne@qt.io> 2020-01-09 20:47:23 +0100
committer Edward Welbourne <edward.welbourne@qt.io> 2020-01-30 17:58:15 +0100
commit 4e84a8b29f13169a75c734920e953d3157768bca (patch)
tree d8d67c2a2f10ec11a1ecdf18898727a627611622 /util/locale_database
parent 264ed73052513015caafaf146286cf63aaa68b03 (diff)
Deduplicate locale data tables
Some entries in tables were sub-strings (e.g. prefixes) of others. Since we store start-index and length (with no need for terminators), any entry that appears as a sub-string of an earlier entry can be recorded without making a separate copy of its content, just by recording where it appeared as a sub-string of an earlier entry. (Sadly this doesn't apply to month- or day-names and their short-forms: for those, we store ';'-joined lists. Thus, although each short-form is a prefix of its long-form, the short-form is stored in a list with other short-forms; and this is not a prefix of the list of matching long-forms.)

The savings are modest (780 bytes at present), but cost us nothing except when running the python script that generates the data files (it takes a little longer now), which usually only happens at a CLDR update.

Change-Id: I05bdaa9283365707bac0190ae983b31f074dd6ed
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Diffstat (limited to 'util/locale_database')
-rwxr-xr-x util/locale_database/qlocalexml2cpp.py 58
1 files changed, 36 insertions, 22 deletions
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index bf58683637..e5e5cccbff 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python2
#############################################################################
##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
@@ -272,32 +272,46 @@ class StringData:
self.data = []
self.hash = {}
self.name = name
+ self.text = '' # Used in quick-search for matches in data
def append(self, s):
- if s in self.hash:
- return self.hash[s]
-
- lst = unicode2hex(s)
- index = len(self.data)
- if index > 65535:
- print "\n\n\n#error Data index is too big!"
- sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
- sys.exit(1)
- size = len(lst)
- if size >= 65535:
- print "\n\n\n#error Data is too big!"
- sys.stderr.write ("\n\n\nERROR: data size exceeds the uint16 range! size = %d\n" % size)
- sys.exit(1)
- token = None
try:
- token = StringDataToken(index, size)
- except Error as e:
- sys.stderr.write("\n\n\nERROR: %s: on data '%s'" % (e, s))
- sys.exit(1)
- self.hash[s] = token
- self.data += lst
+ token = self.hash[s]
+ except KeyError:
+ token = self.__store(s)
+ self.hash[s] = token
return token
+ def __store(self, s):
+ """Add string s to known data.
+
+ Seeks to avoid duplication, where possible.
+ For example, short-forms may be prefixes of long-forms.
+ """
+ if not s:
+ return StringDataToken(0, 0)
+ ucs2 = unicode2hex(s)
+ try:
+ index = self.text.index(s) - 1
+ matched = 0
+ while matched < len(ucs2):
+ index, matched = self.data.index(ucs2[0], index + 1), 1
+ if index + len(ucs2) >= len(self.data):
+ raise ValueError # not found after all !
+ while matched < len(ucs2) and self.data[index + matched] == ucs2[matched]:
+ matched += 1
+ except ValueError:
+ index = len(self.data)
+ self.data += ucs2
+ self.text += s
+
+ assert index >= 0
+ try:
+ return StringDataToken(index, len(ucs2))
+ except ValueError as e:
+ e.args += (self.name, s)
+ raise
+
def write(self, fd):
fd.write("\nstatic const ushort %s[] = {\n" % self.name)
fd.write(wrap_list(self.data))