Use CLDR's names in QLocale::*ToString() for language, script, territory

Various comments need to continue using the enumdata.py names, as they
associate data with particular enum members, but we can now correctly
use the en.xml versions of their names when we report them, rather
than the enum-friendly names we use in the code. As a result, the name
data may stray outside plain ASCII, so it is now UTF-8-encoded; the
code that formerly read it as QLatin1StringView() must therefore use
QString::fromUtf8() instead.

Fixes: QTBUG-94460
Change-Id: Id3b08875a46af58c0555c3e303b0e15a19441509
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Edward Welbourne 2023-08-01 12:35:26 +02:00
parent afd7d68244
commit 1ae24f8b50
8 changed files with 811 additions and 736 deletions

View File

@ -1568,7 +1568,7 @@ QString QLocale::languageToString(Language language)
{ {
if (language > QLocale::LastLanguage) if (language > QLocale::LastLanguage)
return "Unknown"_L1; return "Unknown"_L1;
return QLatin1StringView(language_name_list + language_name_index[language]); return QString::fromUtf8(language_name_list + language_name_index[language]);
} }
/*! /*!
@ -1582,7 +1582,7 @@ QString QLocale::territoryToString(QLocale::Territory territory)
{ {
if (territory > QLocale::LastTerritory) if (territory > QLocale::LastTerritory)
return "Unknown"_L1; return "Unknown"_L1;
return QLatin1StringView(territory_name_list + territory_name_index[territory]); return QString::fromUtf8(territory_name_list + territory_name_index[territory]);
} }
#if QT_DEPRECATED_SINCE(6, 6) #if QT_DEPRECATED_SINCE(6, 6)
@ -1610,7 +1610,7 @@ QString QLocale::scriptToString(QLocale::Script script)
{ {
if (script > QLocale::LastScript) if (script > QLocale::LastScript)
return "Unknown"_L1; return "Unknown"_L1;
return QLatin1StringView(script_name_list + script_name_index[script]); return QString::fromUtf8(script_name_list + script_name_index[script]);
} }
/*! /*!

File diff suppressed because it is too large Load Diff

View File

@ -3382,9 +3382,20 @@ void tst_QLocale::languageToString_data()
QTest::addColumn<QString>("name"); QTest::addColumn<QString>("name");
// Prone to change at CLDR updates. // Prone to change at CLDR updates.
QTest::newRow("cu") << QLocale::Church << u"Church Slavic"_s;
QTest::newRow("dyo") << QLocale::JolaFonyi << u"Jola-Fonyi"_s; QTest::newRow("dyo") << QLocale::JolaFonyi << u"Jola-Fonyi"_s;
QTest::newRow("ff") << QLocale::Fulah << u"Fula"_s;
QTest::newRow("gd") << QLocale::Gaelic << u"Scottish Gaelic"_s;
QTest::newRow("ht") << QLocale::Haitian << u"Haitian Creole"_s;
QTest::newRow("lu") << QLocale::LubaKatanga << u"Luba-Katanga"_s; QTest::newRow("lu") << QLocale::LubaKatanga << u"Luba-Katanga"_s;
QTest::newRow("mgh") << QLocale::MakhuwaMeetto << u"Makhuwa-Meetto"_s; QTest::newRow("mgh") << QLocale::MakhuwaMeetto << u"Makhuwa-Meetto"_s;
QTest::newRow("mgo") << QLocale::Meta << u"Meta\u02bc"_s;
QTest::newRow("mi") << QLocale::Maori << u"M\u0101" "ori"_s;
QTest::newRow("nb") << QLocale::NorwegianBokmal << u"Norwegian Bokm\u00e5" "l"_s;
QTest::newRow("nqo") << QLocale::Nko << u"N\u2019" "Ko"_s;
QTest::newRow("quc") << QLocale::Kiche << u"K\u02bc" "iche\u02bc"_s;
QTest::newRow("sah") << QLocale::Sakha << u"Yakut"_s;
QTest::newRow("vo") << QLocale::Volapuk << u"Volap\u00fc" "k"_s;
} }
void tst_QLocale::languageToString() void tst_QLocale::languageToString()
@ -3401,9 +3412,15 @@ void tst_QLocale::scriptToString_data()
QTest::addColumn<QString>("name"); QTest::addColumn<QString>("name");
// Prone to change at CLDR updates. // Prone to change at CLDR updates.
QTest::newRow("Cans")
<< QLocale::CanadianAboriginalScript << u"Unified Canadian Aboriginal Syllabics"_s;
QTest::newRow("Dupl") << QLocale::DuployanScript << u"Duployan shorthand"_s;
QTest::newRow("Egyp") << QLocale::EgyptianHieroglyphsScript << u"Egyptian hieroglyphs"_s; QTest::newRow("Egyp") << QLocale::EgyptianHieroglyphsScript << u"Egyptian hieroglyphs"_s;
QTest::newRow("Nkoo") << QLocale::NkoScript << u"N\u2019" "Ko"_s;
QTest::newRow("Phag") << QLocale::PhagsPaScript << u"Phags-pa"_s; QTest::newRow("Phag") << QLocale::PhagsPaScript << u"Phags-pa"_s;
QTest::newRow("Rohg") << QLocale::HanifiScript << u"Hanifi Rohingya"_s;
QTest::newRow("Sgnw") << QLocale::SignWritingScript << u"SignWriting"_s; QTest::newRow("Sgnw") << QLocale::SignWritingScript << u"SignWriting"_s;
QTest::newRow("Xsux") << QLocale::CuneiformScript << u"Sumero-Akkadian Cuneiform"_s;
} }
void tst_QLocale::scriptToString() void tst_QLocale::scriptToString()
@ -3420,11 +3437,43 @@ void tst_QLocale::territoryToString_data()
QTest::addColumn<QString>("name"); QTest::addColumn<QString>("name");
// Prone to change at CLDR updates. // Prone to change at CLDR updates.
QTest::newRow("AX") << QLocale::AlandIslands << u"\u00c5" "land Islands"_s;
QTest::newRow("AG") << QLocale::AntiguaAndBarbuda << u"Antigua & Barbuda"_s;
QTest::newRow("BA") << QLocale::BosniaAndHerzegovina << u"Bosnia & Herzegovina"_s;
QTest::newRow("BL") << QLocale::SaintBarthelemy << u"St. Barth\u00e9" "lemy"_s;
QTest::newRow("CC") << QLocale::CocosIslands << u"Cocos (Keeling) Islands"_s;
QTest::newRow("CD") << QLocale::CongoKinshasa << u"Congo - Kinshasa"_s; QTest::newRow("CD") << QLocale::CongoKinshasa << u"Congo - Kinshasa"_s;
QTest::newRow("CG") << QLocale::CongoBrazzaville << u"Congo - Brazzaville"_s; QTest::newRow("CG") << QLocale::CongoBrazzaville << u"Congo - Brazzaville"_s;
QTest::newRow("CI") << QLocale::IvoryCoast << u"C\u00f4" "te d\u2019" "Ivoire"_s;
QTest::newRow("CW") << QLocale::Curacao << u"Cura\u00e7" "ao"_s;
QTest::newRow("EA") << QLocale::CeutaAndMelilla << u"Ceuta & Melilla"_s;
QTest::newRow("GS")
<< QLocale::SouthGeorgiaAndSouthSandwichIslands
<< u"South Georgia & South Sandwich Islands"_s;
QTest::newRow("GW") << QLocale::GuineaBissau << u"Guinea-Bissau"_s; QTest::newRow("GW") << QLocale::GuineaBissau << u"Guinea-Bissau"_s;
QTest::newRow("HM") << QLocale::HeardAndMcDonaldIslands << u"Heard & McDonald Islands"_s;
QTest::newRow("IM") << QLocale::IsleOfMan << u"Isle of Man"_s; QTest::newRow("IM") << QLocale::IsleOfMan << u"Isle of Man"_s;
QTest::newRow("KN") << QLocale::SaintKittsAndNevis << u"St. Kitts & Nevis"_s;
QTest::newRow("LC") << QLocale::SaintLucia << u"St. Lucia"_s;
QTest::newRow("MF") << QLocale::SaintMartin << u"St. Martin"_s;
QTest::newRow("MK") << QLocale::Macedonia << u"North Macedonia"_s;
QTest::newRow("MM") << QLocale::Myanmar << u"Myanmar (Burma)"_s;
QTest::newRow("MO") << QLocale::Macao << u"Macao SAR China"_s;
QTest::newRow("PM") << QLocale::SaintPierreAndMiquelon << u"St. Pierre & Miquelon"_s;
QTest::newRow("PN") << QLocale::Pitcairn << u"Pitcairn Islands"_s;
QTest::newRow("RE") << QLocale::Reunion << u"R\u00e9" "union"_s;
QTest::newRow("SH") << QLocale::SaintHelena << u"St. Helena"_s;
QTest::newRow("SJ") << QLocale::SvalbardAndJanMayen << u"Svalbard & Jan Mayen"_s;
QTest::newRow("ST")
<< QLocale::SaoTomeAndPrincipe << u"S\u00e3" "o Tom\u00e9" " & Pr\u00ed" "ncipe"_s;
QTest::newRow("TA") << QLocale::TristanDaCunha << u"Tristan da Cunha"_s; QTest::newRow("TA") << QLocale::TristanDaCunha << u"Tristan da Cunha"_s;
QTest::newRow("TC") << QLocale::TurksAndCaicosIslands << u"Turks & Caicos Islands"_s;
QTest::newRow("TR") << QLocale::Turkey << u"T\u00fc" "rkiye"_s;
QTest::newRow("TT") << QLocale::TrinidadAndTobago << u"Trinidad & Tobago"_s;
QTest::newRow("UM") << QLocale::UnitedStatesOutlyingIslands << u"U.S. Outlying Islands"_s;
QTest::newRow("VC") << QLocale::SaintVincentAndGrenadines << u"St. Vincent & Grenadines"_s;
QTest::newRow("VI") << QLocale::UnitedStatesVirginIslands << u"U.S. Virgin Islands"_s;
QTest::newRow("WF") << QLocale::WallisAndFutuna << u"Wallis & Futuna"_s;
QTest::newRow("001") << QLocale::World << u"world"_s; QTest::newRow("001") << QLocale::World << u"world"_s;
} }

View File

@ -254,6 +254,9 @@ class CldrAccess (object):
inheritance, where relevant.""" inheritance, where relevant."""
return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale) return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
def englishNaming(self, tag): # see QLocaleXmlWriter.enumData()
return self.__codeMap(tag).get
@property @property
def fileLocales(self) -> Iterable[str]: def fileLocales(self) -> Iterable[str]:
"""Generator for locale IDs seen in file-names. """Generator for locale IDs seen in file-names.

View File

@ -76,7 +76,7 @@ def main(out, err):
writer = QLocaleXmlWriter(emit.write) writer = QLocaleXmlWriter(emit.write)
writer.version(reader.root.cldrVersion) writer.version(reader.root.cldrVersion)
writer.enumData() writer.enumData(reader.root.englishNaming)
writer.likelySubTags(reader.likelySubTags()) writer.likelySubTags(reader.likelySubTags())
writer.locales(reader.readLocales(args.calendars), args.calendars) writer.locales(reader.readLocales(args.calendars), args.calendars)

View File

@ -75,7 +75,7 @@ def names_clash(cldr, enum):
cldr = cldr[:f].rstrip() + ' ' + cldr[t + 1:].lstrip() cldr = cldr[:f].rstrip() + ' ' + cldr[t + 1:].lstrip()
# Various accented letters: # Various accented letters:
remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ô': 'o', 'ü': 'u'} remap = { 'ã': 'a', 'å': 'a', 'ā': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ô': 'o', 'ü': 'u'}
skip = '\u02bc' # Punctuation for which .isalpha() is true. skip = '\u02bc' # Punctuation for which .isalpha() is true.
# Let cldr match (ignoring non-letters and case) any substring as enum: # Let cldr match (ignoring non-letters and case) any substring as enum:
if ''.join(enum.lower().split()) in ''.join( if ''.join(enum.lower().split()) in ''.join(

View File

@ -100,16 +100,20 @@ def convertFormat(format):
class QLocaleXmlReader (object): class QLocaleXmlReader (object):
def __init__(self, filename): def __init__(self, filename):
self.root = self.__parse(filename) self.root = self.__parse(filename)
# Lists of (id, name, code) triples:
languages = tuple(self.__loadMap('language')) from enumdata import language_map, script_map, territory_map
scripts = tuple(self.__loadMap('script')) # Lists of (id, enum name, code, en.xml name) tuples:
territories = tuple(self.__loadMap('territory')) languages = tuple(self.__loadMap('language', language_map))
scripts = tuple(self.__loadMap('script', script_map))
territories = tuple(self.__loadMap('territory', territory_map))
self.__likely = tuple(self.__likelySubtagsMap()) self.__likely = tuple(self.__likelySubtagsMap())
# Mappings {ID: (name, code)}
# Mappings {ID: (enum name, code, en.xml name)}
self.languages = dict((v[0], v[1:]) for v in languages) self.languages = dict((v[0], v[1:]) for v in languages)
self.scripts = dict((v[0], v[1:]) for v in scripts) self.scripts = dict((v[0], v[1:]) for v in scripts)
self.territories = dict((v[0], v[1:]) for v in territories) self.territories = dict((v[0], v[1:]) for v in territories)
# Private mappings {name: (ID, code)}
# Private mappings {enum name: (ID, code)}
self.__langByName = dict((v[1], (v[0], v[2])) for v in languages) self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts) self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
self.__landByName = dict((v[1], (v[0], v[2])) for v in territories) self.__landByName = dict((v[1], (v[0], v[2])) for v in territories)
@ -211,10 +215,11 @@ class QLocaleXmlReader (object):
return name return name
# Implementation details: # Implementation details:
def __loadMap(self, category): def __loadMap(self, category, enum):
kid = self.__firstChildText kid = self.__firstChildText
for element in self.__eachEltInGroup(self.root, f'{category}List', category): for element in self.__eachEltInGroup(self.root, f'{category}List', category):
yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code') key = int(kid(element, 'id'))
yield key, enum[key][0], kid(element, 'code'), kid(element, 'name')
def __likelySubtagsMap(self): def __likelySubtagsMap(self):
def triplet(element, keys=('language', 'script', 'territory'), kid = self.__firstChildText): def triplet(element, keys=('language', 'script', 'territory'), kid = self.__firstChildText):
@ -341,11 +346,21 @@ class QLocaleXmlWriter (object):
self.__write('<localeDatabase>') self.__write('<localeDatabase>')
# Output of various sections, in their usual order: # Output of various sections, in their usual order:
def enumData(self): def enumData(self, code2name):
"""Output name/id/code tables for language, script and territory.
Parameter, code2name, is a function taking 'language',
'script' or 'territory' and returning a lookup function that
maps codes, of the relevant type, to their English names. This
lookup function is passed a code and the name, both taken from
enumdata.py, that QLocale uses, so the .get() of a dict will
work. The English name from this lookup will be used by
QLocale::*ToString() for the enum member whose name is based
on the enumdata.py name passed as fallback to the lookup."""
from enumdata import language_map, script_map, territory_map from enumdata import language_map, script_map, territory_map
self.__enumTable('language', language_map) self.__enumTable('language', language_map, code2name)
self.__enumTable('script', script_map) self.__enumTable('script', script_map, code2name)
self.__enumTable('territory', territory_map) self.__enumTable('territory', territory_map, code2name)
# Prepare to detect any unused codes (see __writeLocale(), close()): # Prepare to detect any unused codes (see __writeLocale(), close()):
self.__languages = set(p[1] for p in language_map.values() self.__languages = set(p[1] for p in language_map.values()
if not p[1].isspace()) if not p[1].isspace())
@ -407,13 +422,18 @@ class QLocaleXmlWriter (object):
def __complain(text): def __complain(text):
raise Error('Attempted to write data after closing :-(') raise Error('Attempted to write data after closing :-(')
def __enumTable(self, tag, table): @staticmethod
def __xmlSafe(text):
return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
def __enumTable(self, tag, table, code2name):
self.__openTag(f'{tag}List') self.__openTag(f'{tag}List')
for key, value in table.items(): enname, safe = code2name(tag), self.__xmlSafe
for key, (name, code) in table.items():
self.__openTag(tag) self.__openTag(tag)
self.inTag('name', value[0]) self.inTag('name', safe(enname(code, name)))
self.inTag('id', key) self.inTag('id', key)
self.inTag('code', value[1]) self.inTag('code', code)
self.__closeTag(tag) self.__closeTag(tag)
self.__closeTag(f'{tag}List') self.__closeTag(f'{tag}List')

View File

@ -20,7 +20,7 @@ from pathlib import Path
from typing import Optional from typing import Optional
from qlocalexml import QLocaleXmlReader from qlocalexml import QLocaleXmlReader
from localetools import unicode2hex, wrap_list, Error, Transcriber, SourceFileEditor, qtbase_root from localetools import *
from iso639_3 import LanguageCodeData from iso639_3 import LanguageCodeData
class LocaleKeySorter: class LocaleKeySorter:
@ -337,7 +337,11 @@ class LocaleDataWriter (LocaleSourceEditor):
for key, value in book.items(): for key, value in book.items():
if key == 0: if key == 0:
continue continue
out(f'"{value[0]}\\0"\n') enum, name = value[0], value[-1]
if names_clash(name, enum):
out(f'"{name}\\0" // {enum}\n')
else:
out(f'"{name}\\0"\n') # Automagically utf-8 encoded
out(';\n\n') out(';\n\n')
out(f'static constexpr quint16 {form}_name_index[] = {{\n') out(f'static constexpr quint16 {form}_name_index[] = {{\n')
@ -346,9 +350,8 @@ class LocaleDataWriter (LocaleSourceEditor):
for key, value in book.items(): for key, value in book.items():
if key == 0: if key == 0:
continue continue
name = value[0] out(f'{index:6d}, // {value[0]}\n')
out(f'{index:6d}, // {name}\n') index += len(value[-1].encode('utf-8')) + 1
index += len(name) + 1
out('};\n\n') out('};\n\n')
@staticmethod @staticmethod