From 8764a0c79db30f61cc7f49f80d3bbf7150e1c90f Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Tue, 9 Apr 2024 19:50:15 -0500 Subject: [PATCH] CBOR: fix sorting of UTF16-to-UTF16 strings This amends commit 394788c68efacdec2676988b4b4ff207b20557f2 (its ChangeLog applies to this commit too). That fixed sorting of UTF8-to- UTF16, but when adding more unit tests, I've discovered that some UTF-16 strings also sorted incorrectly. There were two problems: First, we were assuming that we could rely on the UTF-16 length as a proxy for the UTF-8 one, but that's not true for some cases: * both 1-, 2- and 3-codepoint UTF-8 sequences are 1 codepoint in UTF-16, so some strings would have identical UTF-16 length * 4-codepoint UTF-8 sequences shrink to 2-codepoint UTF-16 ones (2:1) but 3-codepoint UTF-8 sequences shrink to 1 (3:1), so some strings would be longer in UTF-16 but shorter in UTF-8. Second, QtPrivate::compareStrings performs UTF-16 codepoint comparisons not Unicode character ones, so surrogate pairs were sorting before U+E000 to U+FFFF. To fix all of this, we need to decode the UTF-16 string into UTF-32 and calculate the length of that in UTF-8 to be sure we have the sorting order right. Since this is a slight behavior change with a performance penalty, I am choosing to backport only to 6.7. The penalty mostly does not apply to 6.8 due to commit 61556627f25e7c7acbfcc5e54127a392b5239977. Change-Id: If1bf59ecbe014b569ba1fffd17c4c4ddcc874aac Reviewed-by: Ivan Solovev (cherry picked from commit d4c7da9a07dc1434692fe08a61ba22c794574c4f) Reviewed-by: Thiago Macieira --- src/corelib/serialization/qcborvalue.cpp | 69 ++++++++++- .../qcborvalue/tst_qcborvalue.cpp | 111 ++++++++++++++---- 2 files changed, 153 insertions(+), 27 deletions(-) diff --git a/src/corelib/serialization/qcborvalue.cpp b/src/corelib/serialization/qcborvalue.cpp index e415343008c..15b5a1af4c1 100644 --- a/src/corelib/serialization/qcborvalue.cpp +++ b/src/corelib/serialization/qcborvalue.cpp @@ -1087,6 +1087,68 @@ QCborValue QCborContainerPrivate::extractAt_complex(Element e) return makeValue(e.type, 0, container); } +// Similar to QStringIterator::next() but returns malformed surrogate pair +// itself when one is detected, and returns the length in UTF-8. +static auto nextUtf32Character(const char16_t *&ptr, const char16_t *end) noexcept +{ + Q_ASSERT(ptr != end); + struct R { + char32_t c; + qsizetype len = 1; // in UTF-8 code units (bytes) + } r = { *ptr++ }; + + if (r.c < 0x0800) { + if (r.c >= 0x0080) + ++r.len; + } else if (!QChar::isHighSurrogate(r.c) || ptr == end) { + r.len += 2; + } else { + r.len += 3; + r.c = QChar::surrogateToUcs4(r.c, *ptr++); + } + + return r; +} + +static qsizetype stringLengthInUtf8(const char16_t *ptr, const char16_t *end) noexcept +{ + qsizetype len = 0; + while (ptr < end) + len += nextUtf32Character(ptr, end).len; + return len; +} + +static int compareStringsInUtf8(QStringView lhs, QStringView rhs) noexcept +{ + // The UTF-16 length is *usually* comparable, but not always. There are + // pathological cases where they can be wrong, so we need to compare as if + // we were doing it in UTF-8. That includes the case of UTF-16 surrogate + // pairs, because qstring.cpp sorts them before U+E000-U+FFFF. + int diff = 0; + qsizetype len1 = 0; + qsizetype len2 = 0; + const char16_t *src1 = lhs.utf16(); + const char16_t *src2 = rhs.utf16(); + const char16_t *end1 = src1 + lhs.size(); + const char16_t *end2 = src2 + rhs.size(); + + // first, scan until we find a difference (if any) + do { + auto r1 = nextUtf32Character(src1, end1); + auto r2 = nextUtf32Character(src2, end2); + len1 += r1.len; + len2 += r2.len; + diff = int(r1.c) - int(r2.c); // no underflow due to limited range + } while (src1 < end1 && src2 < end2 && diff == 0); + + // compute the full length past this first difference + len1 += stringLengthInUtf8(src1, end1); + len2 += stringLengthInUtf8(src2, end2); + if (len1 == len2) + return diff; + return len1 < len2 ? -1 : 1; +} + QT_WARNING_DISABLE_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned static int compareContainer(const QCborContainerPrivate *c1, const QCborContainerPrivate *c2); static int compareElementNoData(const Element &e1, const Element &e2) @@ -1167,11 +1229,8 @@ static int compareElementRecursive(const QCborContainerPrivate *c1, const Elemen // 5) UTF-8 and US-ASCII // 6) US-ASCII and US-ASCII if ((e1.flags & Element::StringIsUtf16) && (e2.flags & Element::StringIsUtf16)) { - // Case 1: both UTF-16, so lengths are comparable. - // (we can't use memcmp in little-endian machines) - if (len1 == len2) - return QtPrivate::compareStrings(b1->asStringView(), b2->asStringView()); - return len1 < len2 ? -1 : 1; + // Case 1: both UTF-16 + return compareStringsInUtf8(b1->asStringView(), b2->asStringView()); } if (!(e1.flags & Element::StringIsUtf16) && !(e2.flags & Element::StringIsUtf16)) { diff --git a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp index ca95db9fd60..8ea6495d973 100644 --- a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp +++ b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp @@ -1798,6 +1798,18 @@ void tst_QCborValue::mapNested() void tst_QCborValue::sorting() { + // CBOR data comparisons are done as if we were comparing their canonically + // (deterministic) encoded forms in the byte stream, including the Major + // Type. That has a few surprises noted below: + // 1) because the length of a string precedes it, effectively strings are + // sorted by their UTF-8 length before their contents + // 2) because negative integers are stored in negated form, they sort in + // descending order (i.e. by absolute value) + // 3) negative integers (Major Type 1) sort after all positive integers + // (Major Type 0) + // Effectively, this means integers are sorted as sign+magnitude. + // 4) floating point types (Major Type 7) sort after all integers + QCborValue vundef, vnull(nullptr); QCborValue vtrue(true), vfalse(false); QCborValue vint1(1), vint2(2); @@ -1830,7 +1842,6 @@ void tst_QCborValue::sorting() CHECK_ORDER(vint1, vint2); CHECK_ORDER(vdouble1, vdouble2); CHECK_ORDER(vndouble1, vndouble2); - // note: shorter length sorts first CHECK_ORDER(vba1, vba2); CHECK_ORDER(vba2, vba3); CHECK_ORDER(vs1, vs2); @@ -1887,29 +1898,85 @@ void tst_QCborValue::sorting() str.prepend(char(QCborValue::String) + str.size()); return QCborValue::fromCbor(str); }; - // 5 code units in UTF-8 - QCborValue vs4_utf16(u"Mørk"_s); - QCborValue vs4_utf8 = utf8string("Mørk"); - QCOMPARE(vs4_utf8, vs4_utf8); - QCOMPARE(vs4_utf16, vs4_utf16); - QCOMPARE(vs4_utf16, vs4_utf8); - // 5 code units in UTF-16 - QCborValue vs5_utf16(u"Først"_s); - QCborValue vs5_utf8 = utf8string("Først"); - QCOMPARE(vs5_utf8, vs5_utf8); - QCOMPARE(vs5_utf16, vs5_utf16); - QCOMPARE(vs5_utf16, vs5_utf8); + auto addStringCmp = [&](const char *, const char *, QUtf8StringView lhs, + QUtf8StringView rhs) { + // CBOR orders strings by UTF-8 length + bool is_lt = (lhs.size() < rhs.size()); + if (!is_lt) + is_lt = (lhs < rhs); - // sorted by UTF-8 length first, so "Mørk" < "World" < "Først" (!!) - CHECK_ORDER(vs4_utf8, vs3); - CHECK_ORDER(vs4_utf16, vs3); - CHECK_ORDER(vs3, vs5_utf8); - CHECK_ORDER(vs3, vs5_utf16); - CHECK_ORDER(vs4_utf8, vs5_utf8); - CHECK_ORDER(vs4_utf8, vs5_utf16); - CHECK_ORDER(vs4_utf16, vs5_utf8); - CHECK_ORDER(vs4_utf16, vs5_utf16); + QCborValue lhs_utf8 = utf8string(QByteArrayView(lhs).toByteArray()); + QCborValue rhs_utf8 = utf8string(QByteArrayView(rhs).toByteArray()); + QCborValue lhs_utf16 = QString::fromUtf8(lhs); + QCborValue rhs_utf16 = QString::fromUtf8(rhs); + + if (is_lt) { + CHECK_ORDER(lhs_utf8, rhs_utf8); + CHECK_ORDER(lhs_utf8, rhs_utf16); + CHECK_ORDER(lhs_utf16, rhs_utf8); + CHECK_ORDER(lhs_utf16, rhs_utf16); + } else { + QCOMPARE(lhs_utf8, rhs_utf8); + QCOMPARE(lhs_utf8, rhs_utf16); + QCOMPARE(lhs_utf16, rhs_utf8); + QCOMPARE(lhs_utf16, rhs_utf16); + } + }; + auto addStringCmpSameLength = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() == rhs.size()); + addStringCmp("samelength-", tag, lhs, rhs); + }; + auto addStringCmpShorter = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() < rhs.size()); + addStringCmp("shorter-", tag, lhs, rhs); + }; + + // ascii-only is already tested + addStringCmp("equal-", "1continuation", "ab\u00A0c", "ab\u00A0c"); + addStringCmp("equal-", "2continuation", "ab\u0800", "ab\u0800"); + addStringCmp("equal-", "3continuation", "a\U00010000", "a\U00010000"); + + // these strings all have the same UTF-8 length (5 bytes) + addStringCmpSameLength("less-ascii", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-1continuation", "ab\u00A0c", "ab\u07FFc"); + addStringCmpSameLength("less-2continuation", "ab\u0800", "ab\uFFFC"); + addStringCmpSameLength("less-3continuation", "a\U00010000", "a\U0010FFFC"); + addStringCmpSameLength("less-0-vs-1continuation", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-0-vs-2continuation", "abcde", "ab\u0800"); + addStringCmpSameLength("less-0-vs-3continuation", "abcde", "a\U00010000"); + addStringCmpSameLength("less-1-vs-2continuation", "ab\u00A0c", "ab\uFFFC"); + addStringCmpSameLength("less-1-vs-3continuation", "ab\u00A0c", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation", "ab\u0800", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation_surrogate", "a\uFFFCz", "a\U00010000"); // even though U+D800 < U+FFFC + + // these strings have different lengths in UTF-8 + // (0continuation already tested) + addStringCmpShorter("1continuation", "ab\u00A0", "ab\u00A0c"); + addStringCmpShorter("2continuation", "ab\u0800", "ab\u0800c"); + addStringCmpShorter("3continuation", "ab\U00010000", "ab\U00010000c"); + // most of these have the same length in UTF-16! + addStringCmpShorter("0-vs-1continuation", "abc", "ab\u00A0"); + addStringCmpShorter("0-vs-2continuation", "abcd", "ab\u0800"); + addStringCmpShorter("0-vs-3continuation", "abcde", "ab\U00010000"); + addStringCmpShorter("1-vs-2continuation", "ab\u00A0", "ab\u0800"); + addStringCmpShorter("1-vs-3continuation", "abc\u00A0", "ab\U00010000"); + addStringCmpShorter("2-vs-3continuation", "ab\u0800", "ab\U00010000"); + + // lhs is 4xUTF-16 and 8xUTF-8; rhs is 3xUTF-16 but 9xUTF-8 + addStringCmpShorter("3x2-vs-2x3continuation", "\U00010000\U00010000", "\u0800\u0800\u0800"); + + // slight surprising because normally rhs would sort first ("aa" vs "ab" prefix) + // (0continuation_surprise already tested) + addStringCmpShorter("1continuation_surprise", "ab\u00A0", "aa\u00A0c"); + addStringCmpShorter("2continuation_surprise", "ab\u0800", "aa\u0800c"); + addStringCmpShorter("3continuation_surprise", "ab\U00010000", "aa\U00010000c"); + addStringCmpShorter("0-vs-1continuation_surprise", "abc", "aa\u00A0"); + addStringCmpShorter("0-vs-2continuation_surprise", "abcd", "aa\u0800"); + addStringCmpShorter("0-vs-3continuation_surprise", "abcde", "aa\U00010000"); + addStringCmpShorter("1-vs-2continuation_surprise", "ab\u00A0", "aa\u0800"); + addStringCmpShorter("1-vs-3continuation_surprise", "abc\u00A0", "aa\U00010000"); + addStringCmpShorter("2-vs-3continuation_surprise", "ab\u0800", "aa\U00010000"); #undef CHECK_ORDER }