Rewrite CLDR-ingestion's date-time format conversion

Rework the somewhat ad-hoc handling of format blocks. Instead of converting one character at a time, then coming back to map contiguous chunks of various lengths to Qt's best match, use the first non-separator character to select a function that looks ahead to see what to consume with it. Quoted text can be handled the same way, with a look-ahead. This potentially allows for more flexible parsing in future. In the process, matching qlocale_mac.mm, treat all unquoted letters as reserved. The LDML spec says: "Currently, A..Z and a..z are reserved for use as pattern characters (unless they are quoted, see next item)." and its description of literal text explicitly says these reserved characters are not to be understood as literals. Document the letters we do know about as unsupported pattern characters, but don't do anything specific to handle them. This transiently changes zh_TW's "Bh" hour fields to plain "h" but an imminent commit will change that again and there is no other change to data, so the locale data is not regenerated in this commit, to save churn. This makes the parsing front-end function more straightforward and makes it easier to document the quirks of the different format letters and the impedance mismatches between CLDR's and Qt's formats. In the process, recognize C, like j and J, as special magic to ignore and harmonize with what qlocale_mac.mm's macToQtFormat() does, where it's right and dateconverter.py differed. Document the need to stay in sync with this last. Task-number: QTBUG-123872 Change-Id: I490d395b37751c9b8d6f3ee5ed4edbc0d405db5b Reviewed-by: Mate Barany <mate.barany@qt.io> (cherry picked from commit ea806fa3f15061d75d2c2c40144588053e2217b0) Reviewed-by: Friedemann Kleint <Friedemann.Kleint@qt.io>
2024-04-02 18:05:21 +02:00 · 2024-04-02 18:05:21 +02:00 · 1119a3775b
commit 1119a3775b
parent e78b7e7010
1 changed files with 165 additions and 76 deletions
--- a/util/locale_database/dateconverter.py
+++ b/util/locale_database/dateconverter.py
@ -1,81 +1,170 @@
 # Copyright (C) 2016 The Qt Company Ltd.
 # SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0

-import re
+class Converter (object):
+    """Conversion between CLDR and Qt datetime formats.

-def _convert_pattern(pattern):
-    # patterns from http://www.unicode.org/reports/tr35/#Date_Format_Patterns
-    qt_regexps = {
-        r"yyy{3,}" : "yyyy", # more that three digits hence convert to four-digit year
-        r"L" : "M",          # stand-alone month names. not supported.
-        r"g{1,}": "",        # modified julian day. not supported.
-        r"S{1,}" : "",       # fractional seconds. not supported.
-        r"A{1,}" : ""        # milliseconds in day. not supported.
-    }
-    qt_patterns = {
-        "G" : "", "GG" : "", "GGG" : "", "GGGG" : "", "GGGGG" : "", # Era. not supported.
-        "y" : "yyyy", # four-digit year without leading zeroes
-        "Q" : "", "QQ" : "", "QQQ" : "", "QQQQ" : "", # quarter. not supported.
-        "q" : "", "qq" : "", "qqq" : "", "qqqq" : "", # quarter. not supported.
-        "MMMMM" : "MMM", # narrow month name.
-        "LLLLL" : "MMM", # stand-alone narrow month name.
-        "l" : "", # special symbol for chinese leap month. not supported.
-        "w" : "", "W" : "", # week of year/month. not supported.
-        "D" : "", "DD" : "", "DDD" : "", # day of year. not supported.
-        "F" : "", # day of week in month. not supported.
-        "E" : "ddd", "EE" : "ddd", "EEE" : "ddd", "EEEEE" : "ddd", "EEEE" : "dddd", # day of week
-        "e" : "ddd", "ee" : "ddd", "eee" : "ddd", "eeeee" : "ddd", "eeee" : "dddd", # local day of week
-        "c" : "ddd", "cc" : "ddd", "ccc" : "ddd", "ccccc" : "ddd", "cccc" : "dddd", # stand-alone local day of week
-        "a" : "AP", # AM/PM
-        "K" : "h", # Hour 0-11
-        "k" : "H", # Hour 1-24
-        "j" : "", # special reserved symbol.
-        "z" : "t", "zz" : "t", "zzz" : "t", "zzzz" : "t", # timezone
-        "Z" : "t", "ZZ" : "t", "ZZZ" : "t", "ZZZZ" : "t", # timezone
-        "v" : "t", "vv" : "t", "vvv" : "t", "vvvv" : "t", # timezone
-        "V" : "t", "VV" : "t", "VVV" : "t", "VVVV" : "t"  # timezone
-    }
-    if pattern in qt_patterns:
-        return qt_patterns[pattern]
-    for r,v in qt_regexps.items():
-        pattern = re.sub(r, v, pattern)
-    return pattern
+    Keep in sync with qlocale_mac.mm's macToQtFormat().
+    The definitive source of truth is:
+    https://www.unicode.org/reports/tr35/tr35-68/tr35-dates.html#Date_Field_Symbol_Table

-def convert_date(input):
-    result = ""
-    patterns = "GyYuQqMLlwWdDFgEecahHKkjmsSAzZvV"
-    last = ""
-    inquote = 0
-    chars_to_strip = " -"
-    for c in input:
-        if c == "'":
-            inquote = inquote + 1
-        if inquote % 2 == 0:
-            if c in patterns:
-                if not last:
-                    last = c
-                else:
-                    if c in last:
-                        last += c
-                    else:
-                        # pattern changed
-                        converted = _convert_pattern(last)
-                        result += converted
-                        if not converted:
-                            result = result.rstrip(chars_to_strip)
-                        last = c
-                continue
-        if last:
-            # pattern ended
-            converted = _convert_pattern(last)
-            result += converted
-            if not converted:
-                result = result.rstrip(chars_to_strip)
-            last = ""
-        result += c
-    if last:
-        converted = _convert_pattern(last)
-        result += converted
-        if not converted:
-            result = result.rstrip(chars_to_strip)
-    return result.lstrip(chars_to_strip)
+    See convert() for explanation of the approach taken. Each method
+    with a single-letter name is used to scan a prefix of a text,
+    presumed to begin with that letter (or one Qt treats as equivalent
+    to it) and returns a pair (Qt format, length), to use the given Qt
+    format in place of text[:length]. In all cases, length must be
+    positive."""
+
+    @staticmethod
+    def __is_reserved(ch):
+        """Every ASCII letter is a reserved symbol in CLDR datetime formats"""
+        assert len(ch) == 1, ch
+        return ch.isascii() and ch.isalpha();
+    @staticmethod
+    def __count_first(text):
+        """How many of text[0] appear at the start of text ?"""
+        assert text
+        return len(text) - len(text.lstrip(text[0]))
+    @classmethod
+    def __verbatim(cls, text):
+        # Used where our format coincides with LDML's, including on length.
+        n = cls.__count_first(text)
+        return text[:n], n
+    @classmethod
+    def __treat_as(cls, mimic, text):
+        # Helper for aliases
+        n = cls.__count_first(text)
+        return mimic * n, n
+
+    # Please follow alphabetic order, with two cases of the same
+    # letter adjacent, lower before upper.
+    @classmethod
+    def a(cls, text): # AM/PM indicator
+        return 'AP', cls.__count_first(text)
+
+    # A: Milliseconds in day. Not supported.
+
+    @classmethod
+    def c(cls, text): # Stand-alone local day of week
+        # Has length-variants for several cases Qt doesn't support, as
+        # do 'e' and 'E': just map all simply to weekday, abbreviated
+        # or full.
+        n = cls.__count_first(text)
+        return ('dddd' if n == 4 else 'ddd'), n
+
+    # C: Input skeleton symbol
+    d = __verbatim # day (of month or of week, depends on length)
+    # D: Day of year. Not supported.
+    e = c # Local day of week
+    E = c # Just plain day of week
+    # F: Day of week in month. Not supported.
+    # g: Modified julian day. Not supported.
+    # G: Era. Not supported.
+    h = __verbatim # Hour 1-12, treat as 0-11
+    H = __verbatim # Hour 0-23
+    # j: Input skeleton symbol
+    # J: Input skeleton symbol
+
+    @classmethod
+    def k(cls, text): # Hour 1-24, treat as 0-23
+        return cls.__treat_as('H', text)
+    @classmethod
+    def K(cls, text): # Hour 0-11
+        return cls.__treat_as('h', text)
+
+    # l: Deprecated Chinese leap month indicator.
+    @classmethod
+    def L(cls, text): # Stand-alone month names: treat as plain month names.
+        n = cls.__count_first(text)
+        # Length five is narrow; treat same as abbreviated; anything
+        # shorter matches Qt's month forms.
+        return ('MMM' if n > 4 else 'M' * n), n
+
+    m = __verbatim # Minute within the hour.
+    M = L # Plain month names, possibly abbreviated, and numbers.
+
+    # q: Quarter. Not supported.
+    # Q: Quarter. Not supported.
+
+    s = __verbatim # Seconds within the minute.
+    @classmethod
+    def S(cls, text): # Fractional seconds. Only milliseconds supported.
+        # FIXME: spec is unclear, do we need to include the leading
+        # dot or not ? For now, no known locale actually exercises
+        # this, so stick with what we've done on Darwin since long
+        # before adding support here.
+        n = cls.__count_first(text)
+        return ('z' if n < 3 else 'zzz'), n
+
+    # U: Cyclic Year Name. Not supported
+    @classmethod
+    def v(cls, text): # Generic non-location format. Map to abbreviation.
+        return 't', cls.__count_first(text)
+
+    V = v # Zone ID in various forms; VV is IANA ID. Map to abbreviation.
+    # w: Week of year. Not supported.
+    # W: Week of month. Not supported.
+
+    @classmethod
+    def y(cls, text): # Year number.
+        n = cls.__count_first(text)
+        return ('yy' if n == 2 else 'yyyy'), n
+    # Y: Year for Week-of-year calendars
+
+    z = v # Specific (i.e. distinguish standard from DST) non-location format.
+    Z = v # Offset format, optionally with GMT (Qt uses UTC) prefix.
+
+    @staticmethod
+    def scanQuote(text): # Can't have ' as a method name, so handle specially
+        assert text.startswith("'")
+        i = text.find("'", 1) # Find the next; -1 if not present.
+        i = len(text) if i < 0 else i + 1 # Include the close-quote.
+        return text[:i], i
+
+    # Now put all of those to use:
+    @classmethod
+    def convert(cls, text):
+        """Convert a CLDR datetime format string into a Qt one.
+
+        Presumes that the caller will ''.join() the fragments it
+        yields. Each sequence of CLDR field symbols that corresponds
+        to a Qt format token is converted to it; all other CLDR field
+        symbols are discarded; the literals in between fields are
+        preserved verbatim, except that space and hyphen separators
+        immediately before a discarded field are discarded with it.
+
+        The approach is to look at the first symbol of the remainder
+        of the text, at each iteration, and use that first symbol to
+        select a function that will identify how much of the text to
+        consume and what to replace it with."""
+        sep = ''
+        while text:
+            ch = text[0]
+            if ch == "'":
+                quoted, length = cls.scanQuote(text)
+                text = text[length:]
+                sep += quoted
+            elif hasattr(cls, ch):
+                qtform, length = getattr(cls, ch)(text)
+                assert qtform and length > 0, (ch, text, qtform, length)
+                text = text[length:]
+                if sep:
+                    yield sep
+                    sep = ''
+                yield qtform
+            elif cls.__is_reserved(ch):
+                text = text[cls.__count_first(text):]
+                # Discard space or dash separator that was only there
+                # for the sake of the unsupported field:
+                sep = sep.rstrip(' -')
+                # TODO: should we also strip [ -]* from text
+                # immediately following unsupported forms ?
+            else:
+                sep += ch
+                text = text[1:]
+        if sep:
+            yield sep
+
+def convert_date(text):
+    # See Converter.convert()
+    return ''.join(Converter.convert(text))