Move enum-name-munging from LocaleHeaderWriter to QLocaleXmlReader

The former needed the latter's .dupes to do the job, so can now just take a method as a tool to do the job instead, letting .dupes become private. In the process refine the munging to free enumdata.py from having to capitalize each word in its names. This will, in due course, let us use more natural forms in various comments. This causes no change to generated data. Update enumdata.py's introduction doc, mainly to reflect this but also fixing the out-of-date names (old *_list have long been *_map) and adding some details to other paragraphs. Task-number: QTBUG-94460 Change-Id: If195b2e94a53a495fc4f1f216bed07a910439fa7 Reviewed-by: Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>
author: Edward Welbourne <edward.welbourne@qt.io> 2023-08-01 12:03:18 +0200
committer: Edward Welbourne <edward.welbourne@qt.io> 2023-08-09 17:53:26 +0200
commit: 743ceb7cc29236d69bb5fe7c8eedd49cb2eff589 (patch)
tree: 42c50da44bde33770d64d2c6c2a92abc82b460be
parent: e212b3633cbfe15947e0e8059fc82c279867828a (diff)
3 files changed, 46 insertions, 26 deletions
diff --git a/util/locale_database/enumdata.py b/util/locale_database/enumdata.py
index 17279e6e26..3a52ae35fc 100644
--- a/util/locale_database/enumdata.py
+++ b/util/locale_database/enumdata.py
@@ -6,14 +6,18 @@
 # can find a name (taken always from en.xml) that could potentially be
 # used. There is no point adding a mapping for such a code unless the
 # CLDR's common/main/ contains an XML file for at least one locale
-# that exerciss it.
+# that exercises it (and little point absent substantial data).
 
-# Each *_list reflects the current values of its enums in qlocale.h;
-# if new xml language files are available in CLDR, these languages and
+# Each *_map reflects the current values of its enums in qlocale.h; if
+# new xml language files are available in CLDR, these languages and
 # territories need to be *appended* to this list (for compatibility
-# between versions).  Include any spaces present in names (scripts
-# shall squish them out for the enum entries) in *_list, but use the
-# squished forms of names in the *_aliases mappings.
+# between versions). Include any spaces and dashes present in names
+# (they'll be squished out for the enum entries) in *_map, but
+# use the squished forms of names in the *_aliases mappings. The
+# squishing also turns the first letter of each word into a capital so
+# you can safely preserve the case of en.xml's name; but omit (or
+# replace with space) any punctuation aside from dashes and map any
+# accented letters to their un-accented plain ASCII.
 
 # For a new major version (and only then), we can change the
 # numbering, so re-sort each list into alphabetic order (e.g. using
@@ -21,10 +25,10 @@
 # are offset with a blank line, below. After doing that, regenerate
 # locale data as usual; this will cause a binary-incompatible change.
 
-# Note on "macrolanguage" comments: see "ISO 639 macrolanguage" on
-# Wikipedia. A "macrolanguage" is (loosely-speaking) a group of
-# languages so closely related to one another that they could also be
-# regarded as divergent dialects of the macrolanguage.
+# Note on "macrolanguage" comments: see QTBUG-107781 and "ISO 639
+# macrolanguage" on Wikipedia. A "macrolanguage" is (loosely-speaking)
+# a group of languages so closely related to one another that they
+# could also be regarded as divergent dialects of the macrolanguage.
 
 language_map = {
       0: ("AnyLanguage",                 "  "),
diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py
index e63e8d4c98..d9a2e13cf1 100644
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@@ -114,7 +114,7 @@ class QLocaleXmlReader (object):
         self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
         self.__landByName = dict((v[1], (v[0], v[2])) for v in territories)
         # Other properties:
-        self.dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
+        self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
         self.cldrVersion = self.__firstChildText(self.root, "version")
 
     def loadLocaleMap(self, calendars, grumble = lambda text: None):
@@ -184,6 +184,32 @@ class QLocaleXmlReader (object):
                         self.__textByName[give[1]][0]),
                        self.__landByName[give[2]][0])
 
+    def enumify(self, name, suffix):
+        """Stick together the parts of an enumdata.py name.
+
+        Names given in enumdata.py include spaces and hyphens that we
+        can't include in an identifier, such as the name of a member
+        of an enum type. Removing those would lose the word
+        boundaries, so make sure each word starts with a capital (but
+        don't simply capitalize() as some names contain words,
+        e.g. McDonald, that have later capitals in them).
+
+        We also need to resolve duplication between languages and
+        territories (by adding a suffix to each) and add Script to the
+        ends of script-names that don't already end in it."""
+        name = name.replace('-', ' ')
+        # Don't .capitalize() as McDonald is already camel-case (see enumdata.py):
+        name = ''.join(word[0].upper() + word[1:] for word in name.split())
+        if suffix != 'Script':
+            assert not(name in self.__dupes and name.endswith(suffix))
+            return name + suffix if name in self.__dupes else name
+
+        if not name.endswith(suffix):
+            name += suffix
+        if name in self.__dupes:
+            raise Error(f'The script name "{name}" is messy')
+        return name
+
     # Implementation details:
     def __loadMap(self, category):
         kid = self.__firstChildText
diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py
index 137dec80ee..cfb3e2e432 100755
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@@ -456,9 +456,9 @@ class CalendarDataWriter (LocaleSourceEditor):
         months_data.write(self.writer)
 
 class LocaleHeaderWriter (SourceFileEditor):
-    def __init__(self, path, temp, dupes):
+    def __init__(self, path, temp, enumify):
         super().__init__(path, temp)
-        self.__dupes = dupes
+        self.__enumify = enumify
 
     def languages(self, languages):
         self.__enum('Language', languages, self.__language)
@@ -483,20 +483,10 @@ class LocaleHeaderWriter (SourceFileEditor):
         if suffix is None:
             suffix = name
 
-        out, dupes = self.writer.write, self.__dupes
+        out, enumify = self.writer.write, self.__enumify
         out(f'    enum {name} : ushort {{\n')
         for key, value in book.items():
-            member = value[0].replace('-', ' ')
-            if name == 'Script':
-                # Don't .capitalize() as some names are already camel-case (see enumdata.py):
-                member = ''.join(word[0].upper() + word[1:] for word in member.split())
-                if not member.endswith('Script'):
-                    member += 'Script'
-                if member in dupes:
-                    raise Error(f'The script name "{member}" is messy')
-            else:
-                member = ''.join(member.split())
-                member = member + suffix if member in dupes else member
+            member = enumify(value[0], suffix)
             out(f'        {member} = {key},\n')
 
         out('\n        '
@@ -581,7 +571,7 @@ def main(out, err):
     # qlocale.h
     try:
         with LocaleHeaderWriter(qtsrcdir.joinpath('src/corelib/text/qlocale.h'),
-                                qtsrcdir, reader.dupes) as writer:
+                                qtsrcdir, reader.enumify) as writer:
             writer.languages(reader.languages)
             writer.scripts(reader.scripts)
             writer.territories(reader.territories)
author	Edward Welbourne <edward.welbourne@qt.io>	2023-08-01 12:03:18 +0200
committer	Edward Welbourne <edward.welbourne@qt.io>	2023-08-09 17:53:26 +0200
commit	743ceb7cc29236d69bb5fe7c8eedd49cb2eff589 (patch)
tree	42c50da44bde33770d64d2c6c2a92abc82b460be
parent	e212b3633cbfe15947e0e8059fc82c279867828a (diff)