From 40b4ad1866b4c48fa7a64bc2f07c27125398fdba Mon Sep 17 00:00:00 2001 From: Ievgenii Meshcheriakov Date: Thu, 5 May 2022 16:11:14 +0200 Subject: [PATCH] QUnicodeTools: Fix line breaking for potential emojis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement part of LB30b introduced by UAX #14, revision 47 (Unicode 14.0.0): [\p{Extended_Pictographic}&\p{Cn}] × EM This fixes one line breaking test. Task-number: QTBUG-97537 Pick-to: 6.3 Change-Id: I3fd2372a057b7391d8846e9c146f69a54686ea61 Reviewed-by: Edward Welbourne --- src/corelib/text/qunicodetools.cpp | 11 +++++++++++ .../text/qtextboundaryfinder/data/LineBreakTest.txt | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp index ac6302362da..8f1eb2e5325 100644 --- a/src/corelib/text/qunicodetools.cpp +++ b/src/corelib/text/qunicodetools.cpp @@ -604,6 +604,8 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10 QUnicodeTables::LineBreakClass cls = lcls; + const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n'); + for (qsizetype i = 0; i != len; ++i) { qsizetype pos = i; char32_t ucs4 = string[i]; @@ -707,6 +709,14 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes goto next; } + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM + && lastProp->category == QChar::Other_NotAssigned + && lastProp->graphemeBreakClass + == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) { + // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM + goto next; + } + // for South East Asian chars that require a complex analysis, the Unicode // standard recommends to treat them as AL. tailoring that do dictionary analysis can override if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA)) @@ -745,6 +755,7 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes next: cls = ncls; + lastProp = prop; next_no_cls_update: lcls = ncls; } diff --git a/tests/auto/corelib/text/qtextboundaryfinder/data/LineBreakTest.txt b/tests/auto/corelib/text/qtextboundaryfinder/data/LineBreakTest.txt index 1b038cdce9b..32d66183197 100644 --- a/tests/auto/corelib/text/qtextboundaryfinder/data/LineBreakTest.txt +++ b/tests/auto/corelib/text/qtextboundaryfinder/data/LineBreakTest.txt @@ -7678,7 +7678,7 @@ × 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3] × 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [8.0] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.12] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3] × 05D0 × 002D × 05D0 ÷ # × [0.3] HEBREW LETTER ALEF (HL) × [21.02] HYPHEN-MINUS (HY) × [21.1] HEBREW LETTER ALEF (HL) ÷ [0.3] -# × 1F02C × 1F3FF ÷ # × [0.3] (Other) × [30.22] EMOJI MODIFIER FITZPATRICK TYPE-6 (EM) ÷ [0.3] +× 1F02C × 1F3FF ÷ # × [0.3] (Other) × [30.22] EMOJI MODIFIER FITZPATRICK TYPE-6 (EM) ÷ [0.3] × 00A9 ÷ 1F3FF ÷ # × [0.3] COPYRIGHT SIGN (AL) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (EM) ÷ [0.3] # # Lines: 7654