Deduplicate locale data tables

Some entries in tables were sub-strings (e.g. prefixes) of others. Since we store start-index and length (with no need for terminators), any entry that appears as a sub-string of an earlier entry can be recorded without making a separate copy of its content, just by recording where it appeared as a sub-string of an earlier entry. (Sadly this doesn't apply to month- or day-names and their short-forms: for those, we store ';'-joined lists. Thus, although each short-form is a prefix of its long-form, the short-form is stored in a list with other short-forms; and this is not a prefix of the list of matching long-forms.)

The savings are modest (780 bytes at present), but cost us nothing except when running the Python script that generates the data files (it takes a little longer now), which usually only happens at a CLDR update.

Change-Id: I05bdaa9283365707bac0190ae983b31f074dd6ed
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
2020-01-09 20:47:23 +01:00 · 2020-01-09 20:47:23 +01:00 · 4e84a8b29f
commit 4e84a8b29f
parent 264ed73052
5 changed files with 5458 additions and 5464 deletions
--- a/src/corelib/text/qlocale_data_p.h
+++ b/src/corelib/text/qlocale_data_p.h
--- a/src/corelib/time/qhijricalendar_data_p.h
+++ b/src/corelib/time/qhijricalendar_data_p.h
--- a/src/corelib/time/qjalalicalendar_data_p.h
+++ b/src/corelib/time/qjalalicalendar_data_p.h
--- a/src/corelib/time/qromancalendar_data_p.h
+++ b/src/corelib/time/qromancalendar_data_p.h
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python2
 #############################################################################
 ##
-## Copyright (C) 2018 The Qt Company Ltd.
+## Copyright (C) 2020 The Qt Company Ltd.
 ## Contact: https://www.qt.io/licensing/
 ##
 ## This file is part of the test suite of the Qt Toolkit.
@ -272,32 +272,46 @@ class StringData:
        self.data = []
        self.hash = {}
        self.name = name
+        self.text = '' # Used in quick-search for matches in data

    def append(self, s):
-        if s in self.hash:
-            return self.hash[s]
-
-        lst = unicode2hex(s)
-        index = len(self.data)
-        if index > 65535:
-            print "\n\n\n#error Data index is too big!"
-            sys.stderr.write ("\n\n\nERROR: index exceeds the uint16 range! index = %d\n" % index)
-            sys.exit(1)
-        size = len(lst)
-        if size >= 65535:
-            print "\n\n\n#error Data is too big!"
-            sys.stderr.write ("\n\n\nERROR: data size exceeds the uint16 range! size = %d\n" % size)
-            sys.exit(1)
-        token = None
        try:
-            token = StringDataToken(index, size)
-        except Error as e:
-            sys.stderr.write("\n\n\nERROR: %s: on data '%s'" % (e, s))
-            sys.exit(1)
-        self.hash[s] = token
-        self.data += lst
+            token = self.hash[s]
+        except KeyError:
+            # Not seen before: store it, and remember its token for next time.
+            token = self.hash[s] = self.__store(s)
        return token

+    def __store(self, s):
+        """Add string s to known data; return a StringDataToken locating it.
+
+        Seeks to avoid duplication: if s already appears within the data
+        (e.g. a short-form that is a prefix or suffix of an earlier entry)
+        the token refers to that copy; otherwise s is appended to the data.
+        A ValueError from StringDataToken is re-raised with name and s added.
+        """
+        if not s:
+            return StringDataToken(0, 0)
+        ucs2 = list(unicode2hex(s)) # list, so it compares equal to data slices
+        try:
+            # text is only a quick filter, giving a lower bound on where to
+            # look (assumes ucs2 is never shorter than s); data is what counts.
+            index = self.text.index(s)
+            index = self.data.index(ucs2[0], index)
+            # A slice that would over-run data is short, so never matches;
+            # but a match ending exactly at the end of data is accepted.
+            while self.data[index:index + len(ucs2)] != ucs2:
+                index = self.data.index(ucs2[0], index + 1)
+        except ValueError: # s is not in text, or data has no (further) match
+            index = len(self.data)
+            self.data += ucs2
+            self.text += s
+        try:
+            return StringDataToken(index, len(ucs2))
+        except ValueError as e:
+            e.args += (self.name, s)
+            raise
+
    def write(self, fd):
        fd.write("\nstatic const ushort %s[] = {\n" % self.name)
        fd.write(wrap_list(self.data))