Use CLDR's names in QLocale::*ToString() for language, script, territory

Various comments need to continue using the enumdata.py names, as they
associate data with particular enum members, but we can now correctly
use the en.xml versions of their names when we report them, rather
than the enum-friendly names we use in the code. Since this now means
the data may stray outside plain ASCII - it'll be UTF-8-encoded - this
implies replacing the QLatin1StringView()s of the code that formerly
read this data with QString::fromUtf8().

Fixes: QTBUG-94460
Change-Id: Id3b08875a46af58c0555c3e303b0e15a19441509
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Edward Welbourne 2023-08-01 12:35:26 +02:00
parent afd7d68244
commit 1ae24f8b50
8 changed files with 811 additions and 736 deletions

View File

@ -1568,7 +1568,7 @@ QString QLocale::languageToString(Language language)
{
if (language > QLocale::LastLanguage)
return "Unknown"_L1;
return QLatin1StringView(language_name_list + language_name_index[language]);
return QString::fromUtf8(language_name_list + language_name_index[language]);
}
/*!
@ -1582,7 +1582,7 @@ QString QLocale::territoryToString(QLocale::Territory territory)
{
if (territory > QLocale::LastTerritory)
return "Unknown"_L1;
return QLatin1StringView(territory_name_list + territory_name_index[territory]);
return QString::fromUtf8(territory_name_list + territory_name_index[territory]);
}
#if QT_DEPRECATED_SINCE(6, 6)
@ -1610,7 +1610,7 @@ QString QLocale::scriptToString(QLocale::Script script)
{
if (script > QLocale::LastScript)
return "Unknown"_L1;
return QLatin1StringView(script_name_list + script_name_index[script]);
return QString::fromUtf8(script_name_list + script_name_index[script]);
}
/*!

File diff suppressed because it is too large Load Diff

View File

@ -3382,9 +3382,20 @@ void tst_QLocale::languageToString_data()
QTest::addColumn<QString>("name");
// Prone to change at CLDR updates.
QTest::newRow("cu") << QLocale::Church << u"Church Slavic"_s;
QTest::newRow("dyo") << QLocale::JolaFonyi << u"Jola-Fonyi"_s;
QTest::newRow("ff") << QLocale::Fulah << u"Fula"_s;
QTest::newRow("gd") << QLocale::Gaelic << u"Scottish Gaelic"_s;
QTest::newRow("ht") << QLocale::Haitian << u"Haitian Creole"_s;
QTest::newRow("lu") << QLocale::LubaKatanga << u"Luba-Katanga"_s;
QTest::newRow("mgh") << QLocale::MakhuwaMeetto << u"Makhuwa-Meetto"_s;
QTest::newRow("mgo") << QLocale::Meta << u"Meta\u02bc"_s;
QTest::newRow("mi") << QLocale::Maori << u"M\u0101" "ori"_s;
QTest::newRow("nb") << QLocale::NorwegianBokmal << u"Norwegian Bokm\u00e5" "l"_s;
QTest::newRow("nqo") << QLocale::Nko << u"N\u2019" "Ko"_s;
QTest::newRow("quc") << QLocale::Kiche << u"K\u02bc" "iche\u02bc"_s;
QTest::newRow("sah") << QLocale::Sakha << u"Yakut"_s;
QTest::newRow("vo") << QLocale::Volapuk << u"Volap\u00fc" "k"_s;
}
void tst_QLocale::languageToString()
@ -3401,9 +3412,15 @@ void tst_QLocale::scriptToString_data()
QTest::addColumn<QString>("name");
// Prone to change at CLDR updates.
QTest::newRow("Cans")
<< QLocale::CanadianAboriginalScript << u"Unified Canadian Aboriginal Syllabics"_s;
QTest::newRow("Dupl") << QLocale::DuployanScript << u"Duployan shorthand"_s;
QTest::newRow("Egyp") << QLocale::EgyptianHieroglyphsScript << u"Egyptian hieroglyphs"_s;
QTest::newRow("Nkoo") << QLocale::NkoScript << u"N\u2019" "Ko"_s;
QTest::newRow("Phag") << QLocale::PhagsPaScript << u"Phags-pa"_s;
QTest::newRow("Rohg") << QLocale::HanifiScript << u"Hanifi Rohingya"_s;
QTest::newRow("Sgnw") << QLocale::SignWritingScript << u"SignWriting"_s;
QTest::newRow("Xsux") << QLocale::CuneiformScript << u"Sumero-Akkadian Cuneiform"_s;
}
void tst_QLocale::scriptToString()
@ -3420,11 +3437,43 @@ void tst_QLocale::territoryToString_data()
QTest::addColumn<QString>("name");
// Prone to change at CLDR updates.
QTest::newRow("AX") << QLocale::AlandIslands << u"\u00c5" "land Islands"_s;
QTest::newRow("AG") << QLocale::AntiguaAndBarbuda << u"Antigua & Barbuda"_s;
QTest::newRow("BA") << QLocale::BosniaAndHerzegovina << u"Bosnia & Herzegovina"_s;
QTest::newRow("BL") << QLocale::SaintBarthelemy << u"St. Barth\u00e9" "lemy"_s;
QTest::newRow("CC") << QLocale::CocosIslands << u"Cocos (Keeling) Islands"_s;
QTest::newRow("CD") << QLocale::CongoKinshasa << u"Congo - Kinshasa"_s;
QTest::newRow("CG") << QLocale::CongoBrazzaville << u"Congo - Brazzaville"_s;
QTest::newRow("CI") << QLocale::IvoryCoast << u"C\u00f4" "te d\u2019" "Ivoire"_s;
QTest::newRow("CW") << QLocale::Curacao << u"Cura\u00e7" "ao"_s;
QTest::newRow("EA") << QLocale::CeutaAndMelilla << u"Ceuta & Melilla"_s;
QTest::newRow("GS")
<< QLocale::SouthGeorgiaAndSouthSandwichIslands
<< u"South Georgia & South Sandwich Islands"_s;
QTest::newRow("GW") << QLocale::GuineaBissau << u"Guinea-Bissau"_s;
QTest::newRow("HM") << QLocale::HeardAndMcDonaldIslands << u"Heard & McDonald Islands"_s;
QTest::newRow("IM") << QLocale::IsleOfMan << u"Isle of Man"_s;
QTest::newRow("KN") << QLocale::SaintKittsAndNevis << u"St. Kitts & Nevis"_s;
QTest::newRow("LC") << QLocale::SaintLucia << u"St. Lucia"_s;
QTest::newRow("MF") << QLocale::SaintMartin << u"St. Martin"_s;
QTest::newRow("MK") << QLocale::Macedonia << u"North Macedonia"_s;
QTest::newRow("MM") << QLocale::Myanmar << u"Myanmar (Burma)"_s;
QTest::newRow("MO") << QLocale::Macao << u"Macao SAR China"_s;
QTest::newRow("PM") << QLocale::SaintPierreAndMiquelon << u"St. Pierre & Miquelon"_s;
QTest::newRow("PN") << QLocale::Pitcairn << u"Pitcairn Islands"_s;
QTest::newRow("RE") << QLocale::Reunion << u"R\u00e9" "union"_s;
QTest::newRow("SH") << QLocale::SaintHelena << u"St. Helena"_s;
QTest::newRow("SJ") << QLocale::SvalbardAndJanMayen << u"Svalbard & Jan Mayen"_s;
QTest::newRow("ST")
<< QLocale::SaoTomeAndPrincipe << u"S\u00e3" "o Tom\u00e9" " & Pr\u00ed" "ncipe"_s;
QTest::newRow("TA") << QLocale::TristanDaCunha << u"Tristan da Cunha"_s;
QTest::newRow("TC") << QLocale::TurksAndCaicosIslands << u"Turks & Caicos Islands"_s;
QTest::newRow("TR") << QLocale::Turkey << u"T\u00fc" "rkiye"_s;
QTest::newRow("TT") << QLocale::TrinidadAndTobago << u"Trinidad & Tobago"_s;
QTest::newRow("UM") << QLocale::UnitedStatesOutlyingIslands << u"U.S. Outlying Islands"_s;
QTest::newRow("VC") << QLocale::SaintVincentAndGrenadines << u"St. Vincent & Grenadines"_s;
QTest::newRow("VI") << QLocale::UnitedStatesVirginIslands << u"U.S. Virgin Islands"_s;
QTest::newRow("WF") << QLocale::WallisAndFutuna << u"Wallis & Futuna"_s;
QTest::newRow("001") << QLocale::World << u"world"_s;
}

View File

@ -254,6 +254,9 @@ class CldrAccess (object):
inheritance, where relevant."""
return LocaleScanner(name, self.__localeRoots(name), self.__rootLocale)
def englishNaming(self, tag): # see QLocaleXmlWriter.enumData()
return self.__codeMap(tag).get
@property
def fileLocales(self) -> Iterable[str]:
"""Generator for locale IDs seen in file-names.

View File

@ -76,7 +76,7 @@ def main(out, err):
writer = QLocaleXmlWriter(emit.write)
writer.version(reader.root.cldrVersion)
writer.enumData()
writer.enumData(reader.root.englishNaming)
writer.likelySubTags(reader.likelySubTags())
writer.locales(reader.readLocales(args.calendars), args.calendars)

View File

@ -75,7 +75,7 @@ def names_clash(cldr, enum):
cldr = cldr[:f].rstrip() + ' ' + cldr[t + 1:].lstrip()
# Various accented letters:
remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ô': 'o', 'ü': 'u'}
remap = { 'ã': 'a', 'å': 'a', 'ā': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ô': 'o', 'ü': 'u'}
skip = '\u02bc' # Punctuation for which .isalpha() is true.
# Let cldr match (ignoring non-letters and case) any substring as enum:
if ''.join(enum.lower().split()) in ''.join(

View File

@ -100,16 +100,20 @@ def convertFormat(format):
class QLocaleXmlReader (object):
def __init__(self, filename):
self.root = self.__parse(filename)
# Lists of (id, name, code) triples:
languages = tuple(self.__loadMap('language'))
scripts = tuple(self.__loadMap('script'))
territories = tuple(self.__loadMap('territory'))
from enumdata import language_map, script_map, territory_map
# Lists of (id, enum name, code, en.xml name) tuples:
languages = tuple(self.__loadMap('language', language_map))
scripts = tuple(self.__loadMap('script', script_map))
territories = tuple(self.__loadMap('territory', territory_map))
self.__likely = tuple(self.__likelySubtagsMap())
# Mappings {ID: (name, code)}
# Mappings {ID: (enum name, code, en.xml name)}
self.languages = dict((v[0], v[1:]) for v in languages)
self.scripts = dict((v[0], v[1:]) for v in scripts)
self.territories = dict((v[0], v[1:]) for v in territories)
# Private mappings {name: (ID, code)}
# Private mappings {enum name: (ID, code)}
self.__langByName = dict((v[1], (v[0], v[2])) for v in languages)
self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
self.__landByName = dict((v[1], (v[0], v[2])) for v in territories)
@ -211,10 +215,11 @@ class QLocaleXmlReader (object):
return name
# Implementation details:
def __loadMap(self, category):
def __loadMap(self, category, enum):
kid = self.__firstChildText
for element in self.__eachEltInGroup(self.root, f'{category}List', category):
yield int(kid(element, 'id')), kid(element, 'name'), kid(element, 'code')
key = int(kid(element, 'id'))
yield key, enum[key][0], kid(element, 'code'), kid(element, 'name')
def __likelySubtagsMap(self):
def triplet(element, keys=('language', 'script', 'territory'), kid = self.__firstChildText):
@ -341,11 +346,21 @@ class QLocaleXmlWriter (object):
self.__write('<localeDatabase>')
# Output of various sections, in their usual order:
def enumData(self):
def enumData(self, code2name):
"""Output name/id/code tables for language, script and territory.
Parameter, code2name, is a function taking 'language',
'script' or 'territory' and returning a lookup function that
maps codes, of the relevant type, to their English names. This
lookup function is passed a code and the name, both taken from
enumdata.py, that QLocale uses, so the .get() of a dict will
work. The English name from this lookup will be used by
QLocale::*ToString() for the enum member whose name is based
on the enumdata.py name passed as fallback to the lookup."""
from enumdata import language_map, script_map, territory_map
self.__enumTable('language', language_map)
self.__enumTable('script', script_map)
self.__enumTable('territory', territory_map)
self.__enumTable('language', language_map, code2name)
self.__enumTable('script', script_map, code2name)
self.__enumTable('territory', territory_map, code2name)
# Prepare to detect any unused codes (see __writeLocale(), close()):
self.__languages = set(p[1] for p in language_map.values()
if not p[1].isspace())
@ -407,13 +422,18 @@ class QLocaleXmlWriter (object):
def __complain(text):
raise Error('Attempted to write data after closing :-(')
def __enumTable(self, tag, table):
@staticmethod
def __xmlSafe(text):
return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
def __enumTable(self, tag, table, code2name):
self.__openTag(f'{tag}List')
for key, value in table.items():
enname, safe = code2name(tag), self.__xmlSafe
for key, (name, code) in table.items():
self.__openTag(tag)
self.inTag('name', value[0])
self.inTag('name', safe(enname(code, name)))
self.inTag('id', key)
self.inTag('code', value[1])
self.inTag('code', code)
self.__closeTag(tag)
self.__closeTag(f'{tag}List')

View File

@ -20,7 +20,7 @@ from pathlib import Path
from typing import Optional
from qlocalexml import QLocaleXmlReader
from localetools import unicode2hex, wrap_list, Error, Transcriber, SourceFileEditor, qtbase_root
from localetools import *
from iso639_3 import LanguageCodeData
class LocaleKeySorter:
@ -337,7 +337,11 @@ class LocaleDataWriter (LocaleSourceEditor):
for key, value in book.items():
if key == 0:
continue
out(f'"{value[0]}\\0"\n')
enum, name = value[0], value[-1]
if names_clash(name, enum):
out(f'"{name}\\0" // {enum}\n')
else:
out(f'"{name}\\0"\n') # Automagically utf-8 encoded
out(';\n\n')
out(f'static constexpr quint16 {form}_name_index[] = {{\n')
@ -346,9 +350,8 @@ class LocaleDataWriter (LocaleSourceEditor):
for key, value in book.items():
if key == 0:
continue
name = value[0]
out(f'{index:6d}, // {name}\n')
index += len(name) + 1
out(f'{index:6d}, // {value[0]}\n')
index += len(value[-1].encode('utf-8')) + 1
out('};\n\n')
@staticmethod