Add byte-based units to CLDR data

Scan CLDR for {,kilo,mega,giga,tera,peta,exa}byte forms and their IEC equivalents, providing SI and IEC defaults when missing (as all of the IEC forms currently are), in addition to the usual numeric data. Extrapolate from any present data (e.g. French's ko, Mo, Go, To imply Po, Eo and, for IEC, Kio, Mio, etc.), since CLDR only goes up to tera. Propagate this data to QLocale's database ready for use by QLocale::formattedDataSize(). Change-Id: Ie6ee978948c68be9f71ab784a128cbfae3d80ee1 Reviewed-by: Shawn Rutledge <shawn.rutledge@qt.io>
2017-05-30 14:55:33 +02:00 · 2017-05-30 14:55:33 +02:00 · 424d9e9e56
commit 424d9e9e56
parent 536b918eca
5 changed files with 699 additions and 559 deletions
--- a/src/corelib/tools/qlocale_data_p.h
+++ b/src/corelib/tools/qlocale_data_p.h
--- a/src/corelib/tools/qlocale_p.h
+++ b/src/corelib/tools/qlocale_p.h
@ -300,6 +300,9 @@ public:
    quint16 m_narrow_day_names_idx, m_narrow_day_names_size;
    quint16 m_am_idx, m_am_size;
    quint16 m_pm_idx, m_pm_size;
+    quint16 m_byte_idx, m_byte_size;
+    quint16 m_byte_si_quantified_idx, m_byte_si_quantified_size;
+    quint16 m_byte_iec_quantified_idx, m_byte_iec_quantified_size;
    char    m_currency_iso_code[3];
    quint16 m_currency_symbol_idx, m_currency_symbol_size;
    quint16 m_currency_display_name_idx, m_currency_display_name_size;
--- a/util/local_database/cldr2qlocalexml.py
+++ b/util/local_database/cldr2qlocalexml.py
@ -86,6 +86,47 @@ def parse_list_pattern_part_format(pattern):
    # This is a very limited parsing of the format for list pattern part only.
    return pattern.replace("{0}", "%1").replace("{1}", "%2").replace("{2}", "%3")

+def unit_quantifiers(find, path, stem, suffix, known,
+                     # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
+                     # 1000^7 < zebi = 2^{70}, the next quantifiers up:
+                     si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
+    """Work out the unit quantifiers.
+
+    Unfortunately, the CLDR data only go up to terabytes and we want
+    all the way to exabytes; but we can recognize the SI quantifiers
+    as prefixes, strip and identify the tail as the localized
+    translation for 'B' (e.g. French has 'octet' for 'byte' and uses
+    ko, Mo, Go, To from which we can extrapolate Po, Eo).
+
+    Should be called first for the SI quantifiers, with suffix = 'B',
+    then for the IEC ones, with suffix = 'iB'; the list known
+    (initially empty before first call) is used to let the second call
+    know what the first learned about the localized unit.
+    """
+    if suffix == 'B': # first call, known = []
+        tail = suffix
+        for q in si_quantifiers:
+            it = find(path, stem % q)
+            # kB for kilobyte, in contrast with KiB for IEC:
+            q = q[0] if q == 'kilo' else q[0].upper()
+            if not it:
+                it = q + tail
+            elif it.startswith(q):
+                rest = it[1:]
+                tail = rest if all(rest == k for k in known) else suffix
+                known.append(rest)
+            yield it
+    else: # second call, re-using first's known
+        assert suffix == 'iB'
+        if known:
+            byte = known.pop()
+            if all(byte == k for k in known):
+                suffix = 'i' + byte
+        for q in si_quantifiers:
+            yield find(path, stem % q[:2],
+                       # Those don't (yet, v31) exist in CLDR, so we always fall back to:
+                       q[0].upper() + suffix)
+
 def generateLocaleInfo(path):
    if not path.endswith(".xml"):
        return {}
@ -261,6 +302,34 @@ def _generateLocaleInfo(path, language_code, script_code, country_code, variant_
                '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
                ]) + ';'

+    def findUnitDef(path, stem, fallback=''):
+        # The displayName for a quantified unit in en.xml is kByte
+        # instead of kB (etc.), so prefer any unitPattern provided:
+        for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
+            try:
+                ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
+            except xpathlite.Error:
+                continue
+
+            # TODO: exploit count-handling, instead of discarding placeholders
+            if ans.startswith('{0}'):
+                ans = ans[3:].lstrip()
+            if ans:
+                return ans
+
+        return findEntryDef(path, stem + 'displayName', fallback)
+
+    # First without quantifier, then quantified each way:
+    result['byte_unit'] = findEntryDef(
+        path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
+        'bytes')
+    stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
+    known = [] # cases where we *do* have a given version:
+    result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
+    # IEC 60027-2
+    # http://physics.nist.gov/cuu/Units/binary.html
+    result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))
+
    # Used for month and day data:
    namings = (
        ('standaloneLong', 'stand-alone', 'wide'),
--- a/util/local_database/localexml.py
+++ b/util/local_database/localexml.py
@ -111,6 +111,7 @@ class Locale:
    __astxt = ("language", "languageEndonym", "script", "country", "countryEndonym",
               "listPatternPartStart", "listPatternPartMiddle",
               "listPatternPartEnd", "listPatternPartTwo", "am", "pm",
+               'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
               "currencyIsoCode", "currencySymbol", "currencyDisplayName",
               "currencyFormat", "currencyNegativeFormat"
               ) + tuple(propsMonthDay())
@ -169,6 +170,7 @@ class Locale:
                    'alternateQuotationStart', 'alternateQuotationEnd',
                    'listPatternPartStart', 'listPatternPartMiddle',
                    'listPatternPartEnd', 'listPatternPartTwo',
+                    'byte_unit', 'byte_si_quantified', 'byte_iec_quantified',
                    'am', 'pm', 'firstDayOfWeek',
                    'weekendStart', 'weekendEnd',
                    'longDateFormat', 'shortDateFormat',
@ -180,7 +182,7 @@ class Locale:
                    'standaloneLongDays', 'standaloneShortDays', 'standaloneNarrowDays',
                    'currencyIsoCode', 'currencySymbol', 'currencyDisplayName',
                    'currencyFormat', 'currencyNegativeFormat'):
-            ent = camelCase(key.split('_')) if '_' in key else key
+            ent = camelCase(key.split('_')) if key.endswith('_endonym') else key
            print inner + "<%s>%s</%s>" % (ent, escape(get(key)).encode('utf-8'), ent)

        for key in ('currencyDigits', 'currencyRounding'):
@ -198,7 +200,8 @@ class Locale:
          months = ('January', 'February', 'March', 'April', 'May', 'June', 'July',
                    'August', 'September', 'October', 'November', 'December', ''),
          days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
-                  'Thursday', 'Friday', 'Saturday', '')):
+                  'Thursday', 'Friday', 'Saturday', ''),
+          quantifiers=('k', 'M', 'G', 'T', 'P', 'E')):
        """Returns an object representing the C locale."""
        return cls(language='C', language_code='0', language_endonym='',
                   script='AnyScript', script_code='0',
@ -211,6 +214,9 @@ class Locale:
                   listPatternPartMiddle='%1, %2',
                   listPatternPartEnd='%1, %2',
                   listPatternPartTwo='%1, %2',
+                   byte_unit='bytes',
+                   byte_si_quantified=';'.join(q + 'B' for q in quantifiers),
+                   byte_iec_quantified=';'.join(q.upper() + 'iB' for q in quantifiers),
                   am='AM', pm='PM', firstDayOfWeek='mon',
                   weekendStart='sat', weekendEnd='sun',
                   longDateFormat='EEEE, d MMMM yyyy', shortDateFormat='d MMM yyyy',
--- a/util/local_database/qlocalexml2cpp.py
+++ b/util/local_database/qlocalexml2cpp.py
@ -445,6 +445,7 @@ def main():
    days_data = StringData('days_data')
    am_data = StringData('am_data')
    pm_data = StringData('pm_data')
+    byte_unit_data = StringData('byte_unit_data')
    currency_symbol_data = StringData('currency_symbol_data')
    currency_display_name_data = StringData('currency_display_name_data')
    currency_format_data = StringData('currency_format_data')
@ -494,6 +495,10 @@ def main():
                         + '    nDays   '
                         + '     am     ' # am/pm indicators
                         + '     pm     '
+                         # Width 8 + comma
+                         + '  byte   '
+                         + ' siQuant '
+                         + 'iecQuant '
                         # Width 8+4 + comma
                         + '   currISO   '
                         # Width 11 + comma:
@ -527,6 +532,8 @@ def main():
                   + '%8d,' * 4
                   # List patterns, date/time formats, month/day names, am/pm:
                   + '%11s,' * 22
+                   # SI/IEC byte-unit abbreviations:
+                   + '%8s,' * 3
                   # Currency ISO code:
                   + ' %10s, '
                   # Currency and endonyms
@ -574,6 +581,9 @@ def main():
                        days_data.append(l.narrowDays),
                        am_data.append(l.am),
                        pm_data.append(l.pm),
+                        byte_unit_data.append(l.byte_unit),
+                        byte_unit_data.append(l.byte_si_quantified),
+                        byte_unit_data.append(l.byte_iec_quantified),
                        currencyIsoCodeData(l.currencyIsoCode),
                        currency_symbol_data.append(l.currencySymbol),
                        currency_display_name_data.append(l.currencyDisplayName),
@ -588,7 +598,7 @@ def main():
                        l.weekendEnd)
                             + ", // %s/%s/%s\n" % (l.language, l.script, l.country))
    data_temp_file.write(line_format # All zeros, matching the format:
-                         % ( (0,) * (3 + 8 + 4) + ("0,0",) * 22
+                         % ( (0,) * (3 + 8 + 4) + ("0,0",) * (22 + 3)
                             + (currencyIsoCodeData(0),)
                             + ("0,0",) * 6 + (0,) * (2 + 3))
                         + " // trailing 0s\n")
@ -597,7 +607,7 @@ def main():
    # StringData tables:
    for data in (list_pattern_part_data, date_format_data,
                 time_format_data, months_data, days_data,
-                 am_data, pm_data, currency_symbol_data,
+                 byte_unit_data, am_data, pm_data, currency_symbol_data,
                 currency_display_name_data, currency_format_data,
                 endonyms_data):
        data_temp_file.write("\nstatic const ushort %s[] = {\n" % data.name)