From 2df508a41d1c9968def08158fa7a30df0dd958fc Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Tue, 24 Jan 2023 13:18:06 -0800 Subject: [PATCH] QRegularExpression: fix count() when the RE matches a surrogate When the match finds a surrogate pair as the first true Unicode character, then we need to skip both code units of the pair in order to restart the search. PCRE2 does not allow us to search for individual UTF-16 code units. That actually means that counting "." gives us the count of Unicode characters. Fixes: QTBUG-110586 Change-Id: I194d0a32c94148f398e6fffd173d5b5be8137e19 Reviewed-by: Giuseppe D'Angelo Reviewed-by: Marc Mutz (cherry picked from commit b22ae069ac193cfa0479d0bc258a860ef00816b4) Reviewed-by: Qt Cherry-pick Bot --- src/corelib/text/qstring.cpp | 8 +++++++- tests/auto/corelib/text/qstring/tst_qstring.cpp | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 5f151ffbd67..0e3bcf914f0 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -10996,8 +10996,14 @@ qsizetype QtPrivate::count(QStringView haystack, const QRegularExpression &re) QRegularExpressionMatch match = re.matchView(haystack, index + 1); if (!match.hasMatch()) break; - index = match.capturedStart(); count++; + + // Search again, from the next character after the beginning of this + // capture. If the capture starts with a surrogate pair, both together + // count as "one character". + index = match.capturedStart(); + if (index < len && haystack[index].isHighSurrogate()) + ++index; } return count; } diff --git a/tests/auto/corelib/text/qstring/tst_qstring.cpp b/tests/auto/corelib/text/qstring/tst_qstring.cpp index aec2d2897a4..701a33a2635 100644 --- a/tests/auto/corelib/text/qstring/tst_qstring.cpp +++ b/tests/auto/corelib/text/qstring/tst_qstring.cpp @@ -2041,6 +2041,21 @@ void tst_QString::count() QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern); QCOMPARE(emptyStr.count(QRegularExpression("invalid regex\\")), 0); #endif + + QString nonBmpString = u8"\U00010000\U00010000abc\U00010000"; + QCOMPARE(nonBmpString.count(u"\U00010000"), 3); +#if QT_CONFIG(regularexpression) + QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000")), 3); + QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000a?")), 3); + QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000a")), 1); + QCOMPARE(nonBmpString.count(QRegularExpression(".")), 6); + + // can't search for unpaired surrogates + QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern); + QCOMPARE(nonBmpString.count(QRegularExpression(QChar(0xd800))), 0); + QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern); + QCOMPARE(nonBmpString.count(QRegularExpression(QChar(0xdc00))), 0); +#endif } void tst_QString::contains()