QUnicodeTools: Fix line breaking for potential emojis

Implement part of LB30b introduced by UAX #14, revision 47
(Unicode 14.0.0):

    [\p{Extended_Pictographic}&\p{Cn}] × EM

This fixes one line-breaking test (U+1F02C × U+1F3FF), which is now enabled in the test data.

Task-number: QTBUG-97537
Pick-to: 6.3
Change-Id: I3fd2372a057b7391d8846e9c146f69a54686ea61
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Ievgenii Meshcheriakov 2022-05-05 16:11:14 +02:00
parent 08d2ae411f
commit 40b4ad1866
2 changed files with 12 additions and 1 deletion

View File

@ -604,6 +604,8 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
QUnicodeTables::LineBreakClass cls = lcls;
const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n');
for (qsizetype i = 0; i != len; ++i) {
qsizetype pos = i;
char32_t ucs4 = string[i];
@ -707,6 +709,14 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
goto next;
}
if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
&& lastProp->category == QChar::Other_NotAssigned
&& lastProp->graphemeBreakClass
== QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
// LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
goto next;
}
// for South East Asian chars that require a complex analysis, the Unicode
// standard recommends treating them as AL. Tailorings that do dictionary analysis can override this.
if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
@ -745,6 +755,7 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
next:
cls = ncls;
lastProp = prop;
next_no_cls_update:
lcls = ncls;
}

View File

@ -7678,7 +7678,7 @@
× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [8.0] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.12] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 05D0 × 002D × 05D0 ÷ # × [0.3] HEBREW LETTER ALEF (HL) × [21.02] HYPHEN-MINUS (HY) × [21.1] HEBREW LETTER ALEF (HL) ÷ [0.3]
# × 1F02C × 1F3FF ÷ # × [0.3] <reserved-1F02C> (Other) × [30.22] EMOJI MODIFIER FITZPATRICK TYPE-6 (EM) ÷ [0.3]
× 1F02C × 1F3FF ÷ # × [0.3] <reserved-1F02C> (Other) × [30.22] EMOJI MODIFIER FITZPATRICK TYPE-6 (EM) ÷ [0.3]
× 00A9 ÷ 1F3FF ÷ # × [0.3] COPYRIGHT SIGN (AL) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (EM) ÷ [0.3]
#
# Lines: 7654