Derive C locale data from en_US, overriding minor details

The qlocalexml.py Locale.C() had to replicate a whole lot of data that isn't really relevant to how C differs from en_US, and every addition to what we support required further additions to it. So pass the en_US Locale object to the pseudo-constructor so that C can inherit from it and only override the parts where we care about the difference. Hand-code shortening for short Jalali month names, to match Soroush's original contribution, and include the narrow forms in the hard-coded data to keep the generated data unchanged (for now). Note some of the departures from CLDR; we may want to drop these overrides later. In the process, convert the mapping from keys to locales to consistently use IDs for all members of the key, instead of using the (empty) code value for the (as yet unused) variant; it now gets ID 0 and is consistent with returns from codesToIdName(). This makes life easier for the code that now has to construct an en_US key. Task-number: QTBUG-115158 Change-Id: I3d7acb6a4059daec1bba341fcf015c39c7a6803b Reviewed-by: Kai Köhne <kai.koehne@qt.io>
2024-04-26 12:27:10 +02:00 · 2024-04-26 12:27:10 +02:00 · 0c809fc3b5
commit 0c809fc3b5
parent 5641b17e2f
3 changed files with 50 additions and 91 deletions
--- a/util/locale_database/cldr.py
+++ b/util/locale_database/cldr.py
@ -146,7 +146,7 @@ class CldrReader (object):
        return alias, defaults, winIds

    def readLocales(self, calendars = ('gregorian',)):
-        return {(k.language_id, k.script_id, k.territory_id, k.variant_code): k
+        return {(k.language_id, k.script_id, k.territory_id, k.variant_id): k
                for k in self.__allLocales(calendars)}

    def __allLocales(self, calendars):
@ -264,7 +264,7 @@ class CldrReader (object):
            language = names[0], language_code = language, language_id = ids[0],
            script = names[1], script_code = script, script_id = ids[1],
            territory = names[2], territory_code = territory, territory_id = ids[2],
-            variant_code = variant)
+            variant_code = variant, variant_id = ids[3])

        firstDay, weStart, weEnd = self.root.weekData(territory)
        assert all(day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
--- a/util/locale_database/cldr2qlocalexml.py
+++ b/util/locale_database/cldr2qlocalexml.py
@ -91,7 +91,8 @@ def main(argv, out, err):
    writer.enumData(reader.root.englishNaming)
    writer.likelySubTags(reader.likelySubTags())
    writer.zoneData(*reader.zoneData()) # Locale-independent zone data.
-    writer.locales(reader.readLocales(args.calendars), args.calendars)
+    en_US = tuple(id for id, name in reader.root.codesToIdName('en', '', 'US'))
+    writer.locales(reader.readLocales(args.calendars), args.calendars, en_US)

    writer.close(err.write)
    return 0
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@ -372,10 +372,19 @@ class QLocaleXmlWriter (object):
            self.__closeTag('msZoneIana')
        self.__closeTag('windowsZone')

-    def locales(self, locales, calendars):
+    def locales(self, locales, calendars, en_US):
+        """Write the data for each locale.
+
+        First argument, locales, is the mapping whose values are the
+        Locale objects, with each key being the matching tuple of
+        numeric IDs for language, script, territory and variant.
+        Second argument is a tuple of calendar names. Third is the
+        tuple of numeric IDs that corresponds to en_US (needed to
+        provide fallbacks for the C locale)."""
+
        self.__openTag('localeList')
        self.__openTag('locale')
-        self.__writeLocale(Locale.C(calendars), calendars)
+        self.__writeLocale(Locale.C(locales[en_US]), calendars)
        self.__closeTag('locale')
        for key in sorted(locales.keys()):
            self.__openTag('locale')
@ -575,97 +584,46 @@ class Locale (object):
        for key in ('currencyDigits', 'currencyRounding'):
            write(key, get(key))

-    # Tools used by __monthNames:
-    def fullName(i, name): return name
-    def firstThree(i, name): return name[:3]
-    def initial(i, name): return name[:1]
-    def number(i, name): return str(i + 1)
-    def islamicShort(i, name):
-        if not name: return name
-        if name == 'Shawwal': return 'Shaw.'
-        words = name.split()
-        if words[0].startswith('Dhu'):
-            words[0] = words[0][:7] + '.'
-        elif len(words[0]) > 3:
-            words[0] = words[0][:3] + '.'
-        return ' '.join(words)
-    @staticmethod
-    def __monthNames(calendars,
-                     known={ # Map calendar to (names, extractors...):
-            # TODO: do we even need these ?  CLDR's root.xml seems to
-            # have them, complete with yeartype="leap" handling for
-            # Hebrew's extra.
-            'gregorian': (('January', 'February', 'March', 'April', 'May', 'June', 'July',
-                           'August', 'September', 'October', 'November', 'December'),
-                          # Extractor pairs, (plain, standalone)
-                          (fullName, fullName), # long
-                          (firstThree, firstThree), # short
-                          (number, initial)), # narrow
-            'persian': (('Farvardin', 'Ordibehesht', 'Khordad', 'Tir', 'Mordad',
-                         'Shahrivar', 'Mehr', 'Aban', 'Azar', 'Dey', 'Bahman', 'Esfand'),
-                        (fullName, fullName),
-                        (firstThree, firstThree),
-                        (number, initial)),
-            'islamic': (('Muharram', 'Safar', 'Rabiʻ I', 'Rabiʻ II', 'Jumada I',
-                         'Jumada II', 'Rajab', 'Shaʻban', 'Ramadan', 'Shawwal',
-                         'Dhuʻl-Qiʻdah', 'Dhuʻl-Hijjah'),
-                        (fullName, fullName),
-                        (islamicShort, islamicShort),
-                        (number, number)),
-            'hebrew': (('Tishri', 'Heshvan', 'Kislev', 'Tevet', 'Shevat', 'Adar I',
-                        'Adar', 'Nisan', 'Iyar', 'Sivan', 'Tamuz', 'Av'),
-                       (fullName, fullName),
-                       (fullName, fullName),
-                       (number, number)),
-                     },
-                     sizes=('long', 'short', 'narrow')):
-        for cal in calendars:
-            try:
-                data = known[cal]
-            except KeyError as e: # Need to add an entry to known, above.
-                e.args += ('Unsupported calendar:', cal)
-                raise
-            names, get = data[0], data[1:]
-            for n, size in enumerate(sizes):
-                yield ('_'.join((camelCase((size, 'months')), cal)),
-                       ';'.join(get[n][0](i, x) for i, x in enumerate(names)))
-                yield ('_'.join((camelCase(('standalone', size, 'months')), cal)),
-                       ';'.join(get[n][1](i, x) for i, x in enumerate(names)))
-    del fullName, firstThree, initial, number, islamicShort
-
    @classmethod
-    def C(cls, calendars=('gregorian',),
-          days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
-                  'Thursday', 'Friday', 'Saturday'),
-          quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
-        """Returns an object representing the C locale."""
-        return cls(cls.__monthNames(calendars),
+    def C(cls, en_US):
+        """Returns an object representing the C locale.
+
+        Required argument, en_US, is the corresponding object for the
+        en_US locale (or the en_US_POSIX one if we ever support
+        variants). The C locale inherits from this, overriding what it
+        may need to."""
+        base = en_US.__dict__.copy()
+        # Soroush's original contribution shortened Jalali month names
+        # - contrary to CLDR, which doesn't abbreviate these in
+        # root.xml or en.xml, although some locales do, e.g. fr_CA.
+        # For compatibility with that,
+        for k in ('shortMonths_persian', 'standaloneShortMonths_persian'):
+            base[k] = ';'.join(x[:3] for x in base[k].split(';'))
+
+        return cls(base,
                   language='C', language_code='0', languageEndonym='',
                   script='AnyScript', script_code='0',
                   territory='AnyTerritory', territory_code='0', territoryEndonym='',
-                   groupSizes=(3, 3, 1),
-                   decimal='.', group=',', list=';', percent='%',
-                   zero='0', minus='-', plus='+', exp='e',
+                   # CLDR has non-ASCII versions of these:
                   quotationStart='"', quotationEnd='"',
-                   alternateQuotationStart='\'', alternateQuotationEnd='\'',
-                   listPatternPartStart='%1, %2',
-                   listPatternPartMiddle='%1, %2',
-                   listPatternPartEnd='%1, %2',
-                   listPatternPartTwo='%1, %2',
-                   byte_unit='bytes',
-                   byte_si_quantified=';'.join(q + 'B' for q in quantifiers),
-                   byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers),
-                   am='AM', pm='PM', firstDayOfWeek='mon',
-                   weekendStart='sat', weekendEnd='sun',
+                   alternateQuotationStart="'", alternateQuotationEnd="'",
+                   # CLDR gives 'dddd, MMMM d, yyyy', 'M/d/yy', 'h:mm:ss Ap tttt',
+                   # 'h:mm Ap' with non-breaking space before Ap.
                   longDateFormat='dddd, d MMMM yyyy', shortDateFormat='d MMM yyyy',
                   longTimeFormat='HH:mm:ss t', shortTimeFormat='HH:mm:ss',
-                   longDays=';'.join(days),
-                   shortDays=';'.join(d[:3] for d in days),
-                   narrowDays='7;1;2;3;4;5;6',
-                   standaloneLongDays=';'.join(days),
-                   standaloneShortDays=';'.join(d[:3] for d in days),
-                   standaloneNarrowDays=';'.join(d[:1] for d in days),
-                   currencyIsoCode='', currencySymbol='',
-                   currencyDisplayName='',
+                   # CLDR has US-$ and US-style formats:
+                   currencyIsoCode='', currencySymbol='', currencyDisplayName='',
                   currencyDigits=2, currencyRounding=1,
-                   currencyFormat='%1%2', currencyNegativeFormat='')
+                   currencyFormat='%1%2', currencyNegativeFormat='',
+                   # We may want to fall back to CLDR for some of these:
+                   firstDayOfWeek='mon', # CLDR has 'sun'
+                   exp='e', # CLDR has 'E'
+                   listPatternPartEnd='%1, %2', # CLDR has '%1, and %2'
+                   listPatternPartTwo='%1, %2', # CLDR has '%1 and %2'
+                   narrowDays='7;1;2;3;4;5;6', # CLDR has letters
+                   narrowMonths_gregorian='1;2;3;4;5;6;7;8;9;10;11;12', # CLDR has letters
+                   standaloneNarrowMonths_persian='F;O;K;T;M;S;M;A;A;D;B;E', # CLDR has digits
+                   # Keep these explicit, despite matching CLDR:
+                   decimal='.', group=',', percent='%',
+                   zero='0', minus='-', plus='+',
+                   am='AM', pm='PM', weekendStart='sat', weekendEnd='sun')