From dfb1f8d29351094bcf7d1bab91c69c93de890b10 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Sun, 22 Aug 2021 19:36:49 +0200 Subject: [PATCH] Unicode: fix the grapheme clustering algorithm An oversight in the code kept the algorithm in the GB11 state, even if the codepoint that is being processed wouldn't allow for that (for instance a sequence of ExtPic, Ext and Any). Refactor the code of GB11/GB12/GB13 to deal with code points that break the sequences (falling back to "normal" handling). Add some manual tests; interestingly enough, the failing cases are not covered by Unicode's tests, as we now pass the entire test suite. Amends a794c5e287381bd056008b20ae55f9b1e0acf138. Fixes: QTBUG-94951 Change-Id: If987d5ccf7c6b13de36d049b1b3d88a3c4b6dd00 Reviewed-by: Thiago Macieira (cherry picked from commit d48058f1970a795afb4cedaae54dde7ca69cb252) Reviewed-by: Qt Cherry-pick Bot --- src/corelib/text/qunicodetools.cpp | 70 +++++++----- .../tst_qtextboundaryfinder.cpp | 101 ++++++++++++++++++ 2 files changed, 142 insertions(+), 29 deletions(-) diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp index 0acc80b5dc1..55ebfc8da81 100644 --- a/src/corelib/text/qunicodetools.cpp +++ b/src/corelib/text/qunicodetools.cpp @@ -163,9 +163,50 @@ static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttrib QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass; bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls); + bool handled = false; switch (state) { case GB::State::Normal: + break; // will deal with it below + + case GB::State::GB11_ExtPicExt: + Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend); + if (cls == QUnicodeTables::GraphemeBreak_Extend) { + // keep going in the current state + Q_ASSERT(!shouldBreak); // GB9, do not break before Extend + handled = true; + } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) { + state = GB::State::GB11_ExtPicExtZWJ; + Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ + handled = true; + } else { + state = GB::State::Normal; + } + break; + + case GB::State::GB11_ExtPicExtZWJ: + Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ); + if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { + shouldBreak = false; + handled = true; + } + + state = GB::State::Normal; + break; + + case GB::State::GB12_13_RI: + Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator); + if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { + shouldBreak = false; + handled = true; + } + + state = GB::State::Normal; + break; + } + + if (!handled) { + Q_ASSERT(state == GB::State::Normal); if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11 if (cls == QUnicodeTables::GraphemeBreak_Extend) { state = GB::State::GB11_ExtPicExt; @@ -177,35 +218,6 @@ static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttrib } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13 state = GB::State::GB12_13_RI; } - - break; - case GB::State::GB11_ExtPicExt: - Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend); - if (cls == QUnicodeTables::GraphemeBreak_Extend) { - // keep going in the current state - Q_ASSERT(!shouldBreak); // GB9, do not break before Extend - } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) { - state = GB::State::GB11_ExtPicExtZWJ; - Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ - } - - break; - - case GB::State::GB11_ExtPicExtZWJ: - Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ); - if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) - shouldBreak = false; - - state = GB::State::Normal; - break; - - case GB::State::GB12_13_RI: - Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator); - if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) - shouldBreak = false; - - state = GB::State::Normal; - break; } if (shouldBreak) diff --git a/tests/auto/corelib/text/qtextboundaryfinder/tst_qtextboundaryfinder.cpp b/tests/auto/corelib/text/qtextboundaryfinder/tst_qtextboundaryfinder.cpp index b48b751ea0b..f9b597f604c 100644 --- a/tests/auto/corelib/text/qtextboundaryfinder/tst_qtextboundaryfinder.cpp +++ b/tests/auto/corelib/text/qtextboundaryfinder/tst_qtextboundaryfinder.cpp @@ -51,6 +51,9 @@ private slots: void lineBoundariesDefault(); #endif + void graphemeBoundaries_manual_data(); + void graphemeBoundaries_manual(); + void wordBoundaries_manual_data(); void wordBoundaries_manual(); void sentenceBoundaries_manual_data(); @@ -286,6 +289,104 @@ void tst_QTextBoundaryFinder::lineBoundariesDefault() } #endif // QT_BUILD_INTERNAL +void tst_QTextBoundaryFinder::graphemeBoundaries_manual_data() +{ + QTest::addColumn("testString"); + QTest::addColumn>("expectedBreakPositions"); + + { + // QTBUG-94951 + QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xD83D), QChar(0xDCF2), // U+1F4F2 MOBILE PHONE WITH RIGHTWARDS ARROW AT LEFT + QChar(0xD83D), QChar(0xDCE9), // U+1F4E9 ENVELOPE WITH DOWNWARDS ARROW ABOVE + }; + QString testString(s, sizeof(s)/sizeof(s[0])); + + QList expectedBreakPositions{0, 2, 4, 6}; + QTest::newRow("+EXTPICxEXT+EXTPIC+EXTPIC+") << testString << expectedBreakPositions; + } + + { + QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + }; + QString testString(s, sizeof(s)/sizeof(s[0])); + + QList expectedBreakPositions{0, 2, 4}; + QTest::newRow("+EXTPICxEXT+EXTPICxEXT+") << testString << expectedBreakPositions; + } + + { + QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + }; + QString testString(s, sizeof(s)/sizeof(s[0])); + + QList expectedBreakPositions{0, 4, 7}; + QTest::newRow("+EXTPICxEXTxEXTxEXT+EXTPICxEXTxEXT+") << testString << expectedBreakPositions; + } + + { + QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0x200D), // U+200D ZERO WIDTH JOINER + QChar(0xD83D), QChar(0xDCF2), // U+1F4F2 MOBILE PHONE WITH RIGHTWARDS ARROW AT LEFT + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + }; + QString testString(s, sizeof(s)/sizeof(s[0])); + + QList expectedBreakPositions{0, 7}; + QTest::newRow("+EXTPICxEXTxEXTxZWJxEXTPICxEXTxEXT+") << testString << expectedBreakPositions; + } + + { + QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0x200D), // U+200D ZERO WIDTH JOINER + QChar(0x0041), // U+0041 LATIN CAPITAL LETTER A + QChar(0xD83D), QChar(0xDCF2), // U+1F4F2 MOBILE PHONE WITH RIGHTWARDS ARROW AT LEFT + }; + QString testString(s, sizeof(s)/sizeof(s[0])); + + QList expectedBreakPositions{0, 4, 5, 7}; + QTest::newRow("+EXTPICxEXTxEXTxZWJ+Any+EXTPIC+") << testString << expectedBreakPositions; + } + + { + QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART + QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16 + QChar(0xD83C), QChar(0xDDEA), // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E + QChar(0xD83C), QChar(0xDDFA), // U+1F1FA REGIONAL INDICATOR SYMBOL LETTER U + QChar(0xD83C), QChar(0xDDEA), // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E + QChar(0xD83C), QChar(0xDDFA), // U+1F1FA REGIONAL INDICATOR SYMBOL LETTER U + QChar(0xD83C), QChar(0xDDEA), // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E + QChar(0x0041), // U+0041 LATIN CAPITAL LETTER A + }; + QString testString(s, sizeof(s)/sizeof(s[0])); + + QList expectedBreakPositions{0, 2, 6, 10, 12, 13}; + QTest::newRow("+EXTPICxEXT+RIxRI+RIxRI+RI+ANY+") << testString << expectedBreakPositions; + } +} + +void tst_QTextBoundaryFinder::graphemeBoundaries_manual() +{ + QFETCH(QString, testString); + QFETCH(QList, expectedBreakPositions); + + doTestData(testString, expectedBreakPositions, QTextBoundaryFinder::Grapheme); +} + void tst_QTextBoundaryFinder::wordBoundaries_manual_data() { QTest::addColumn("testString");