diff --git a/util/locale_database/enumdata.py b/util/locale_database/enumdata.py index 17279e6e26e..3a52ae35fc4 100644 --- a/util/locale_database/enumdata.py +++ b/util/locale_database/enumdata.py @@ -6,14 +6,18 @@ # can find a name (taken always from en.xml) that could potentially be # used. There is no point adding a mapping for such a code unless the # CLDR's common/main/ contains an XML file for at least one locale -# that exerciss it. +# that exercises it (and little point absent substantial data). -# Each *_list reflects the current values of its enums in qlocale.h; -# if new xml language files are available in CLDR, these languages and +# Each *_map reflects the current values of its enums in qlocale.h; if +# new xml language files are available in CLDR, these languages and # territories need to be *appended* to this list (for compatibility -# between versions). Include any spaces present in names (scripts -# shall squish them out for the enum entries) in *_list, but use the -# squished forms of names in the *_aliases mappings. +# between versions). Include any spaces and dashes present in names +# (they'll be squished out for the enum entries) in *_map, but +# use the squished forms of names in the *_aliases mappings. The +# squishing also turns the first letter of each word into a capital so +# you can safely preserve the case of en.xml's name; but omit (or +# replace with space) any punctuation aside from dashes and map any +# accented letters to their un-accented plain ASCII. # For a new major version (and only then), we can change the # numbering, so re-sort each list into alphabetic order (e.g. using @@ -21,10 +25,10 @@ # are offset with a blank line, below. After doing that, regenerate # locale data as usual; this will cause a binary-incompatible change. -# Note on "macrolanguage" comments: see "ISO 639 macrolanguage" on -# Wikipedia. 
A "macrolanguage" is (loosely-speaking) a group of -# languages so closely related to one another that they could also be -# regarded as divergent dialects of the macrolanguage. +# Note on "macrolanguage" comments: see QTBUG-107781 and "ISO 639 +# macrolanguage" on Wikipedia. A "macrolanguage" is (loosely-speaking) +# a group of languages so closely related to one another that they +# could also be regarded as divergent dialects of the macrolanguage. language_map = { 0: ("AnyLanguage", " "), diff --git a/util/locale_database/qlocalexml.py b/util/locale_database/qlocalexml.py index e63e8d4c98b..d9a2e13cf19 100644 --- a/util/locale_database/qlocalexml.py +++ b/util/locale_database/qlocalexml.py @@ -114,7 +114,7 @@ class QLocaleXmlReader (object): self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts) self.__landByName = dict((v[1], (v[0], v[2])) for v in territories) # Other properties: - self.dupes = set(v[1] for v in languages) & set(v[1] for v in territories) + self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories) self.cldrVersion = self.__firstChildText(self.root, "version") def loadLocaleMap(self, calendars, grumble = lambda text: None): @@ -184,6 +184,32 @@ class QLocaleXmlReader (object): self.__textByName[give[1]][0]), self.__landByName[give[2]][0]) + def enumify(self, name, suffix): + """Stick together the parts of an enumdata.py name. + + Names given in enumdata.py include spaces and hyphens that we + can't include in an identifier, such as the name of a member + of an enum type. Removing those would lose the word + boundaries, so make sure each word starts with a capital (but + don't simply capitalize() as some names contain words, + e.g. McDonald, that have later capitals in them). 
+ + We also need to resolve duplication between languages and + territories (by adding a suffix to each) and add Script to the + ends of script-names that don't already end in it.""" + name = name.replace('-', ' ') + # Don't .capitalize() as McDonald is already camel-case (see enumdata.py): + name = ''.join(word[0].upper() + word[1:] for word in name.split()) + if suffix != 'Script': + assert not(name in self.__dupes and name.endswith(suffix)) + return name + suffix if name in self.__dupes else name + + if not name.endswith(suffix): + name += suffix + if name in self.__dupes: + raise Error(f'The script name "{name}" is messy') + return name + # Implementation details: def __loadMap(self, category): kid = self.__firstChildText diff --git a/util/locale_database/qlocalexml2cpp.py b/util/locale_database/qlocalexml2cpp.py index 137dec80eec..cfb3e2e4326 100755 --- a/util/locale_database/qlocalexml2cpp.py +++ b/util/locale_database/qlocalexml2cpp.py @@ -456,9 +456,9 @@ class CalendarDataWriter (LocaleSourceEditor): months_data.write(self.writer) class LocaleHeaderWriter (SourceFileEditor): - def __init__(self, path, temp, dupes): + def __init__(self, path, temp, enumify): super().__init__(path, temp) - self.__dupes = dupes + self.__enumify = enumify def languages(self, languages): self.__enum('Language', languages, self.__language) @@ -483,20 +483,10 @@ class LocaleHeaderWriter (SourceFileEditor): if suffix is None: suffix = name - out, dupes = self.writer.write, self.__dupes + out, enumify = self.writer.write, self.__enumify out(f' enum {name} : ushort {{\n') for key, value in book.items(): - member = value[0].replace('-', ' ') - if name == 'Script': - # Don't .capitalize() as some names are already camel-case (see enumdata.py): - member = ''.join(word[0].upper() + word[1:] for word in member.split()) - if not member.endswith('Script'): - member += 'Script' - if member in dupes: - raise Error(f'The script name "{member}" is messy') - else: - member = 
''.join(member.split()) - member = member + suffix if member in dupes else member + member = enumify(value[0], suffix) out(f' {member} = {key},\n') out('\n ' @@ -581,7 +571,7 @@ def main(out, err): # qlocale.h try: with LocaleHeaderWriter(qtsrcdir.joinpath('src/corelib/text/qlocale.h'), - qtsrcdir, reader.dupes) as writer: + qtsrcdir, reader.enumify) as writer: writer.languages(reader.languages) writer.scripts(reader.scripts) writer.territories(reader.territories)