QRegularExpression: fix count() when the RE matches a surrogate

When a match starts with a surrogate pair (a single Unicode character
encoded as two UTF-16 code units), we need to skip both code units of the
pair before restarting the search. PCRE2 does not allow us to search for
individual UTF-16 code units.

As a consequence, counting matches of "." gives us the number of Unicode
characters (code points), not the number of UTF-16 code units.

Fixes: QTBUG-110586
Change-Id: I194d0a32c94148f398e6fffd173d5b5be8137e19
Reviewed-by: Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
Reviewed-by: Marc Mutz <marc.mutz@qt.io>
(cherry picked from commit b22ae069ac193cfa0479d0bc258a860ef00816b4)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
Thiago Macieira 2023-01-24 13:18:06 -08:00 committed by Qt Cherry-pick Bot
parent e070aebb1f
commit 2df508a41d
2 changed files with 22 additions and 1 deletions

View File

@ -10996,8 +10996,14 @@ qsizetype QtPrivate::count(QStringView haystack, const QRegularExpression &re)
QRegularExpressionMatch match = re.matchView(haystack, index + 1); QRegularExpressionMatch match = re.matchView(haystack, index + 1);
if (!match.hasMatch()) if (!match.hasMatch())
break; break;
index = match.capturedStart();
count++; count++;
// Search again, from the next character after the beginning of this
// capture. If the capture starts with a surrogate pair, both together
// count as "one character".
index = match.capturedStart();
if (index < len && haystack[index].isHighSurrogate())
++index;
} }
return count; return count;
} }

View File

@ -2041,6 +2041,21 @@ void tst_QString::count()
QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern); QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern);
QCOMPARE(emptyStr.count(QRegularExpression("invalid regex\\")), 0); QCOMPARE(emptyStr.count(QRegularExpression("invalid regex\\")), 0);
#endif #endif
QString nonBmpString = u8"\U00010000\U00010000abc\U00010000";
QCOMPARE(nonBmpString.count(u"\U00010000"), 3);
#if QT_CONFIG(regularexpression)
QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000")), 3);
QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000a?")), 3);
QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000a")), 1);
QCOMPARE(nonBmpString.count(QRegularExpression(".")), 6);
// can't search for unpaired surrogates
QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern);
QCOMPARE(nonBmpString.count(QRegularExpression(QChar(0xd800))), 0);
QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern);
QCOMPARE(nonBmpString.count(QRegularExpression(QChar(0xdc00))), 0);
#endif
} }
void tst_QString::contains() void tst_QString::contains()