Deduplicate locale data tables

Some entries in tables were sub-strings (e.g. prefixes) of others. Since we store start-index and length (with no need for terminators), any entry that appears as a sub-string of an earlier entry can be recorded without making a separate copy of its content, just by recording where it appeared as a sub-string of an earlier entry. (Sadly this doesn't apply to month- or day-names and their short-forms: for those, we store ';'-joined lists. Thus, although each short-form is a prefix of its long-form, the short-form is stored in a list with other short-forms; and this is not a prefix of the list of matching long-forms.) The savings are modest (780 bytes at present), but cost us nothing except when running the Python script that generates the data files (it takes a little longer now), which usually only happens at a CLDR update.

Change-Id: I05bdaa9283365707bac0190ae983b31f074dd6ed
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
author: Edward Welbourne <edward.welbourne@qt.io> 2020-01-09 20:47:23 +0100
committer: Edward Welbourne <edward.welbourne@qt.io> 2020-01-30 17:58:15 +0100
commit: 4e84a8b29f13169a75c734920e953d3157768bca (patch)
tree: d8d67c2a2f10ec11a1ecdf18898727a627611622 /util/locale_database
parent: 264ed73052513015caafaf146286cf63aaa68b03 (diff)
1 files changed, 36 insertions, 22 deletions
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index bf58683637..e5e5cccbff 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 #############################################################################
 ##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@@ -272,32 +272,46 @@ class StringData:
         self.data = []
         self.hash = {}
         self.name = name
+        self.text = '' # Used in quick-search for matches in data
 
     def append(self, s):
-        if s in self.hash:
-            return self.hash[s]
-
-        lst = unicode2hex(s)
-        index = len(self.data)
-        if index > 65535:
-            print "\n\n\n#error Data index is too big!"
-            sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
-            sys.exit(1)
-        size = len(lst)
-        if size >= 65535:
-            print "\n\n\n#error Data is too big!"
-            sys.stderr.write ("\n\n\nERROR: data size exceeds the uint16 range! size = %d\n" % size)
-            sys.exit(1)
-        token = None
         try:
-            token = StringDataToken(index, size)
-        except Error as e:
-            sys.stderr.write("\n\n\nERROR: %s: on data '%s'" % (e, s))
-            sys.exit(1)
-        self.hash[s] = token
-        self.data += lst
+            token = self.hash[s]
+        except KeyError:
+            token = self.__store(s)
+            self.hash[s] = token
         return token
 
+    def __store(self, s):
+        """Add string s to known data.
+
+        Seeks to avoid duplication, where possible.
+        For example, short-forms may be prefixes of long-forms.
+        """
+        if not s:
+            return StringDataToken(0, 0)
+        ucs2 = unicode2hex(s)
+        try:
+            index = self.text.index(s) - 1
+            matched = 0
+            while matched < len(ucs2):
+                index, matched = self.data.index(ucs2[0], index + 1), 1
+                if index + len(ucs2) >= len(self.data):
+                    raise ValueError # not found after all !
+                while matched < len(ucs2) and self.data[index + matched] == ucs2[matched]:
+                    matched += 1
+        except ValueError:
+            index = len(self.data)
+            self.data += ucs2
+            self.text += s
+
+        assert index >= 0
+        try:
+            return StringDataToken(index, len(ucs2))
+        except ValueError as e:
+            e.args += (self.name, s)
+            raise
+
     def write(self, fd):
         fd.write("\nstatic const ushort %s[] = {\n" % self.name)
         fd.write(wrap_list(self.data))
author	Edward Welbourne <edward.welbourne@qt.io>	2020-01-09 20:47:23 +0100
committer	Edward Welbourne <edward.welbourne@qt.io>	2020-01-30 17:58:15 +0100
commit	4e84a8b29f13169a75c734920e953d3157768bca (patch)
tree	d8d67c2a2f10ec11a1ecdf18898727a627611622 /util/locale_database
parent	264ed73052513015caafaf146286cf63aaa68b03 (diff)