Unicode: Extract EastAsianWidth property

This property is needed to properly implement the line breaking
algorithm from UAX #14.

Task-number: QTBUG-97537
Pick-to: 6.3
Change-Id: Ia83cc553c9ef19fae33560721630849d2a95af84
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Ievgenii Meshcheriakov 2022-05-06 13:44:58 +02:00
parent 40b4ad1866
commit 838a7a01f3
4 changed files with 12294 additions and 9449 deletions

File diff suppressed because it is too large Load Diff

View File

@ -43,7 +43,8 @@ struct Properties {
ushort joining : 3;
signed short digitValue : 5;
signed short mirrorDiff : 16;
ushort unicodeVersion : 8; /* 5 used */
ushort unicodeVersion : 5; /* 5 used */
ushort eastAsianWidth : 3; /* 3 used */
ushort nfQuickCheck : 8;
#ifdef Q_OS_WASM
unsigned char : 0; //wasm 64 packing trick
@ -68,6 +69,15 @@ Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char16_t ucs2) noexcept;
// Compile-time guard: the build fails unless Properties occupies exactly 20 bytes.
static_assert(sizeof(Properties) == 20);
// Values of the eastAsianWidth property. They are stored in the 3-bit
// Properties::eastAsianWidth field and returned by eastAsianWidth().
// NOTE(review): the enumerators look like the short East_Asian_Width value
// aliases from UAX #11 (Ambiguous, Fullwidth, Halfwidth, Neutral, Narrow,
// Wide) - confirm against the Unicode Character Database.
enum class EastAsianWidth : unsigned int {
A,
F,
H,
N,
Na,
W,
};
enum GraphemeBreakClass {
GraphemeBreak_Any,
GraphemeBreak_CR,
@ -179,6 +189,10 @@ Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;
// Convenience overload: forwards the code unit held by \a ch to the
// char32_t overload of idnaMapping().
inline QStringView idnaMapping(QChar ch) noexcept
{
    const char32_t codePoint = ch.unicode();
    return idnaMapping(codePoint);
}
// Looks up the EastAsianWidth value recorded in the property tables for \a ucs4.
Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;

// Convenience overload: forwards the code unit held by \a ch to the
// char32_t overload above.
inline EastAsianWidth eastAsianWidth(QChar ch) noexcept
{
    const char32_t codePoint = ch.unicode();
    return eastAsianWidth(codePoint);
}
} // namespace QUnicodeTables
QT_END_NAMESPACE

File diff suppressed because it is too large Load Diff

View File

@ -59,6 +59,47 @@ static void initAgeMap()
}
}
// Verbatim source text of the EastAsianWidth enum definition. main() writes
// this string into the generated output, so its enumerators and their order
// have to stay in sync with the EastAsianWidth enum this tool uses itself.
static const char *east_asian_width_string =
R"(enum class EastAsianWidth : unsigned int {
A,
F,
H,
N,
Na,
W,
};
)";
// Tool-side copy of the enum whose text is emitted through
// east_asian_width_string. createPropertyInfo() writes these values out as
// plain unsigned integers and the generated eastAsianWidth() casts them back,
// so the enumerator order here must match the emitted enum exactly.
enum class EastAsianWidth : unsigned int {
A,
F,
H,
N,
Na,
W,
};
static QHash<QByteArray, EastAsianWidth> eastAsianWidthMap;
static void initEastAsianWidthMap()
{
constexpr struct W {
EastAsianWidth width;
const char *name;
} widths[] = {
{ EastAsianWidth::A, "A" },
{ EastAsianWidth::F, "F" },
{ EastAsianWidth::H, "H" },
{ EastAsianWidth::N, "N" },
{ EastAsianWidth::Na, "Na" },
{ EastAsianWidth::W, "W" },
};
for (auto &w : widths)
eastAsianWidthMap.insert(w.name, w.width);
}
static QHash<QByteArray, QChar::Category> categoryMap;
static void initCategoryMap()
@ -849,7 +890,8 @@ static const char *property_string =
" ushort joining : 3;\n"
" signed short digitValue : 5;\n"
" signed short mirrorDiff : 16;\n"
" ushort unicodeVersion : 8; /* 5 used */\n"
" ushort unicodeVersion : 5; /* 5 used */\n"
" ushort eastAsianWidth : 3; /* 3 used */\n"
" ushort nfQuickCheck : 8;\n" // could be narrowed
"#ifdef Q_OS_WASM\n"
" unsigned char : 0; //wasm 64 packing trick\n"
@ -896,6 +938,10 @@ static const char *methods =
"Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
"inline QStringView idnaMapping(QChar ch) noexcept\n"
"{ return idnaMapping(ch.unicode()); }\n"
"\n"
"Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;\n"
"inline EastAsianWidth eastAsianWidth(QChar ch) noexcept\n"
"{ return eastAsianWidth(ch.unicode()); }\n"
"\n";
// NOTE(review): presumably the size in bytes that struct Properties is
// expected to have (the generated header asserts sizeof(Properties) == 20) -
// confirm where this constant is consumed.
static constexpr int SizeOfPropertiesStruct{20};
@ -918,6 +964,7 @@ struct PropertyFlags {
&& direction == o.direction
&& joining == o.joining
&& age == o.age
&& eastAsianWidth == o.eastAsianWidth
&& digitValue == o.digitValue
&& mirrorDiff == o.mirrorDiff
&& lowerCaseDiff == o.lowerCaseDiff
@ -945,6 +992,8 @@ struct PropertyFlags {
QChar::JoiningType joining : 3;
// from DerivedAge.txt
QChar::UnicodeVersion age : 5;
// From EastAsianWidth.txt
EastAsianWidth eastAsianWidth = EastAsianWidth::N;
int digitValue = -1;
int mirrorDiff : 16;
@ -1483,6 +1532,52 @@ static void readDerivedAge()
}
}
static void readEastAsianWidth()
{
qDebug("Reading EastAsianWidth.txt");
QFile f("data/EastAsianWidth.txt");
if (!f.exists() || !f.open(QFile::ReadOnly))
qFatal("Couldn't find or read EastAsianWidth.txt");
while (!f.atEnd()) {
QByteArray line = f.readLine().trimmed();
int comment = line.indexOf('#');
line = (comment < 0 ? line : line.left(comment)).simplified();
if (line.isEmpty())
continue;
QList<QByteArray> fields = line.split(';');
Q_ASSERT(fields.size() == 2);
// That would be split(".."), but that API does not exist.
const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
QList<QByteArray> cl = codePoints.split('.');
Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
const QByteArray widthString = fields[1].trimmed();
if (!eastAsianWidthMap.contains(widthString)) {
qFatal("Unhandled EastAsianWidth property value for %s: %s",
qPrintable(codePoints), qPrintable(widthString));
}
auto width = eastAsianWidthMap.value(widthString);
bool ok;
const int first = cl[0].toInt(&ok, 16);
const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
Q_ASSERT(ok);
for (int codepoint = first; codepoint <= last; ++codepoint) {
UnicodeData &ud = UnicodeData::valueRef(codepoint);
// Ensure that ranges don't overlap.
Q_ASSERT(ud.p.eastAsianWidth == EastAsianWidth::N);
ud.p.eastAsianWidth = width;
}
}
}
static void readDerivedNormalizationProps()
{
qDebug("Reading DerivedNormalizationProps.txt");
@ -2896,9 +2991,12 @@ static QByteArray createPropertyInfo()
// " signed short mirrorDiff : 16;\n"
out += QByteArray::number( p.mirrorDiff );
out += ", ";
// " ushort unicodeVersion : 8; /* 5 used */\n"
// " ushort unicodeVersion : 5; /* 5 used */\n"
out += QByteArray::number( p.age );
out += ", ";
// " ushort eastAsianWidth : 3; /* 3 used */\n"
out += QByteArray::number( static_cast<unsigned int>(p.eastAsianWidth) );
out += ", ";
// " ushort nfQuickCheck : 8;\n"
out += QByteArray::number( p.nfQuickCheck );
out += ", ";
@ -3003,6 +3101,11 @@ static QByteArray createPropertyInfo()
"{\n"
" return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
"}\n"
"\n"
"Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept\n"
"{\n"
" return static_cast<EastAsianWidth>(qGetProp(ucs4)->eastAsianWidth);\n"
"}\n"
"\n";
return out;
@ -3458,6 +3561,7 @@ QByteArray createCasingInfo()
int main(int, char **)
{
initAgeMap();
initEastAsianWidthMap();
initCategoryMap();
initDecompositionMap();
initDirectionMap();
@ -3473,6 +3577,7 @@ int main(int, char **)
readBidiMirroring();
readArabicShaping();
readDerivedAge();
readEastAsianWidth();
readDerivedNormalizationProps();
readSpecialCasing();
readCaseFolding();
@ -3548,6 +3653,7 @@ int main(int, char **)
f.write("namespace QUnicodeTables {\n\n");
f.write(property_string);
f.write(sizeOfPropertiesStructCheck);
f.write(east_asian_width_string);
f.write(grapheme_break_class_string);
f.write(word_break_class_string);
f.write(sentence_break_class_string);