From 78cf89c07d7fa834a455afa5862823e50171415c Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Fri, 31 Jul 2020 14:55:57 +0200 Subject: [PATCH] Use checked string iteration in case conversions The Unicode table code can only be safely called on valid code-points. So code that calls it must only pass it valid Unicode data. The string iterator's Unchecked Unchecked methods only provide this guarantee when the string being iterated is guaranteed to be valid UTF-16; while client code should only use QString, QStringView and friends on valid UTF-16 data, we have no way to be sure they have respected that. So take the few extra cycles to actually check validity in the course of iterating strings, when the resulting code-points are to be passed to the Unicode table look-ups. Add tests that case mapping doesn't access Unicode tables out of range (it'll trigger the new assertion). Added some comments to qchar.h that helped me understand surrogates. Change-Id: Iec2c3106bf1a875bdaa1d622f6cf94d7007e281e Reviewed-by: Thiago Macieira --- src/corelib/text/qchar.cpp | 1 + src/corelib/text/qchar.h | 6 ++-- src/corelib/text/qstring.cpp | 10 +++---- src/corelib/text/qunicodetables.cpp | 1 + .../auto/corelib/text/qstring/tst_qstring.cpp | 28 +++++++++++++++++++ util/unicode/main.cpp | 1 + 6 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/corelib/text/qchar.cpp b/src/corelib/text/qchar.cpp index 52b1a1cbece..4d156502ef0 100644 --- a/src/corelib/text/qchar.cpp +++ b/src/corelib/text/qchar.cpp @@ -1553,6 +1553,7 @@ static auto fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept auto data() const { return chars; } auto size() const { return sz; } } result; + Q_ASSERT(uc <= QChar::LastValidCodePoint); auto pp = result.chars; diff --git a/src/corelib/text/qchar.h b/src/corelib/text/qchar.h index c3fe6d31072..fd9a4728c8d 100644 --- a/src/corelib/text/qchar.h +++ b/src/corelib/text/qchar.h @@ -504,11 +504,11 @@ public: } static constexpr inline bool isHighSurrogate(char32_t ucs4) noexcept { - return ((ucs4 & 0xfffffc00) == 0xd800); + return (ucs4 & 0xfffffc00) == 0xd800; // 0xd800 + up to 1023 (0x3ff) } static constexpr inline bool isLowSurrogate(char32_t ucs4) noexcept { - return ((ucs4 & 0xfffffc00) == 0xdc00); + return (ucs4 & 0xfffffc00) == 0xdc00; // 0xdc00 + up to 1023 (0x3ff) } static constexpr inline bool isSurrogate(char32_t ucs4) noexcept { @@ -520,6 +520,8 @@ public: } static constexpr inline char32_t surrogateToUcs4(char16_t high, char16_t low) noexcept { + // 0x010000 through 0x10ffff, provided params are actual high, low surrogates. + // 0x010000 + ((high - 0xd800) << 10) + (low - 0xdc00), optimized: return (char32_t(high)<<10) + low - 0x35fdc00; } static constexpr inline char32_t surrogateToUcs4(QChar high, QChar low) noexcept diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 68b7011f158..068247adb48 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -4755,7 +4755,7 @@ bool QString::isUpper() const QStringIterator it(*this); while (it.hasNext()) { - const char32_t uc = it.nextUnchecked(); + const char32_t uc = it.next(); if (qGetProp(uc)->cases[QUnicodeTables::UpperCase].diff) return false; } @@ -4781,7 +4781,7 @@ bool QString::isLower() const QStringIterator it(*this); while (it.hasNext()) { - const char32_t uc = it.nextUnchecked(); + const char32_t uc = it.next(); if (qGetProp(uc)->cases[QUnicodeTables::LowerCase].diff) return false; } @@ -6143,7 +6143,7 @@ static QString detachAndConvertCase(T &str, QStringIterator it, QUnicodeTables:: QChar *pp = s.begin() + it.index(); // will detach if necessary do { - const auto folded = fullConvertCase(it.nextUnchecked(), which); + const auto folded = fullConvertCase(it.next(), which); if (Q_UNLIKELY(folded.size() > 1)) { if (folded.chars[0] == *pp && folded.size() == 2) { // special case: only second actually changed (e.g. surrogate pairs), @@ -6183,9 +6183,9 @@ static QString convertCase(T &str, QUnicodeTables::Case which) QStringIterator it(p, e); while (it.hasNext()) { - const char32_t uc = it.nextUnchecked(); + const char32_t uc = it.next(); if (qGetProp(uc)->cases[which].diff) { - it.recedeUnchecked(); + it.recede(); return detachAndConvertCase(str, it, which); } } diff --git a/src/corelib/text/qunicodetables.cpp b/src/corelib/text/qunicodetables.cpp index 9c6aa477f86..0809d338a58 100644 --- a/src/corelib/text/qunicodetables.cpp +++ b/src/corelib/text/qunicodetables.cpp @@ -9592,6 +9592,7 @@ static const Properties uc_properties[] = { Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(char32_t ucs4) noexcept { + Q_ASSERT(ucs4 <= QChar::LastValidCodePoint); if (ucs4 < 0x11000) return uc_properties + uc_property_trie[uc_property_trie[ucs4 >> 5] + (ucs4 & 0x1f)]; diff --git a/tests/auto/corelib/text/qstring/tst_qstring.cpp b/tests/auto/corelib/text/qstring/tst_qstring.cpp index 126bc089036..ad59025dfee 100644 --- a/tests/auto/corelib/text/qstring/tst_qstring.cpp +++ b/tests/auto/corelib/text/qstring/tst_qstring.cpp @@ -458,6 +458,8 @@ private slots: void simplified_data(); void simplified(); void trimmed(); + void unicodeTableAccess_data(); + void unicodeTableAccess(); void toUpper(); void toLower(); void isLower_isUpper_data(); @@ -1949,6 +1951,32 @@ void tst_QString::rightJustified() QCOMPARE(a, QLatin1String("ABC")); } +void tst_QString::unicodeTableAccess_data() +{ + QTest::addColumn("invalid"); + + const auto join = [](char16_t high, char16_t low) { + const QChar pair[2] = { high, low }; + return QString(pair, 2); + }; + // Least high surrogate for which an invalid successor produces an error: + QTest::newRow("least-high") << join(0xdbf8, 0xfc00); + // Least successor that, after a high surrogate, produces invalid: + QTest::newRow("least-follow") << join(0xdbff, 0xe000); +} + +void tst_QString::unicodeTableAccess() +{ + // QString processing must not access unicode tables out of bounds. + QFETCH(QString, invalid); + // Exercise methods, to see if any assertions trigger: + const auto upper = invalid.toUpper(); + const auto lower = invalid.toLower(); + const auto folded = invalid.toCaseFolded(); + // Fatuous test, just to use those. + QVERIFY(upper == invalid || lower == invalid || folded == invalid || lower != upper); +} + void tst_QString::toUpper() { QCOMPARE( QString().toUpper(), QString() ); diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 167f632e373..df806eff0b6 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -2501,6 +2501,7 @@ static QByteArray createPropertyInfo() out += "Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(char32_t ucs4) noexcept\n" "{\n" + " Q_ASSERT(ucs4 <= QChar::LastValidCodePoint);\n" " if (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + ")\n" " return uc_properties + uc_property_trie[uc_property_trie[ucs4 >> " + QByteArray::number(BMP_SHIFT) + "] + (ucs4 & 0x"