diff --git a/src/corelib/serialization/qcborvalue.cpp b/src/corelib/serialization/qcborvalue.cpp index e415343008c..15b5a1af4c1 100644 --- a/src/corelib/serialization/qcborvalue.cpp +++ b/src/corelib/serialization/qcborvalue.cpp @@ -1087,6 +1087,68 @@ QCborValue QCborContainerPrivate::extractAt_complex(Element e) return makeValue(e.type, 0, container); } +// Similar to QStringIterator::next() but returns malformed surrogate pair +// itself when one is detected, and returns the length in UTF-8. +static auto nextUtf32Character(const char16_t *&ptr, const char16_t *end) noexcept +{ + Q_ASSERT(ptr != end); + struct R { + char32_t c; + qsizetype len = 1; // in UTF-8 code units (bytes) + } r = { *ptr++ }; + + if (r.c < 0x0800) { + if (r.c >= 0x0080) + ++r.len; + } else if (!QChar::isHighSurrogate(r.c) || ptr == end) { + r.len += 2; + } else { + r.len += 3; + r.c = QChar::surrogateToUcs4(r.c, *ptr++); + } + + return r; +} + +static qsizetype stringLengthInUtf8(const char16_t *ptr, const char16_t *end) noexcept +{ + qsizetype len = 0; + while (ptr < end) + len += nextUtf32Character(ptr, end).len; + return len; +} + +static int compareStringsInUtf8(QStringView lhs, QStringView rhs) noexcept +{ + // The UTF-16 length is *usually* comparable, but not always. There are + // pathological cases where they can be wrong, so we need to compare as if + // we were doing it in UTF-8. That includes the case of UTF-16 surrogate + // pairs, because qstring.cpp sorts them before U+E000-U+FFFF. + int diff = 0; + qsizetype len1 = 0; + qsizetype len2 = 0; + const char16_t *src1 = lhs.utf16(); + const char16_t *src2 = rhs.utf16(); + const char16_t *end1 = src1 + lhs.size(); + const char16_t *end2 = src2 + rhs.size(); + + // first, scan until we find a difference (if any) + do { + auto r1 = nextUtf32Character(src1, end1); + auto r2 = nextUtf32Character(src2, end2); + len1 += r1.len; + len2 += r2.len; + diff = int(r1.c) - int(r2.c); // no underflow due to limited range + } while (src1 < end1 && src2 < end2 && diff == 0); + + // compute the full length past this first difference + len1 += stringLengthInUtf8(src1, end1); + len2 += stringLengthInUtf8(src2, end2); + if (len1 == len2) + return diff; + return len1 < len2 ? -1 : 1; +} + QT_WARNING_DISABLE_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned static int compareContainer(const QCborContainerPrivate *c1, const QCborContainerPrivate *c2); static int compareElementNoData(const Element &e1, const Element &e2) @@ -1167,11 +1229,8 @@ static int compareElementRecursive(const QCborContainerPrivate *c1, const Elemen // 5) UTF-8 and US-ASCII // 6) US-ASCII and US-ASCII if ((e1.flags & Element::StringIsUtf16) && (e2.flags & Element::StringIsUtf16)) { - // Case 1: both UTF-16, so lengths are comparable. - // (we can't use memcmp in little-endian machines) - if (len1 == len2) - return QtPrivate::compareStrings(b1->asStringView(), b2->asStringView()); - return len1 < len2 ? -1 : 1; + // Case 1: both UTF-16 + return compareStringsInUtf8(b1->asStringView(), b2->asStringView()); } if (!(e1.flags & Element::StringIsUtf16) && !(e2.flags & Element::StringIsUtf16)) { diff --git a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp index ca95db9fd60..8ea6495d973 100644 --- a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp +++ b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp @@ -1798,6 +1798,18 @@ void tst_QCborValue::mapNested() void tst_QCborValue::sorting() { + // CBOR data comparisons are done as if we were comparing their canonically + // (deterministic) encoded forms in the byte stream, including the Major + // Type. That has a few surprises noted below: + // 1) because the length of a string precedes it, effectively strings are + // sorted by their UTF-8 length before their contents + // 2) because negative integers are stored in negated form, they sort in + // descending order (i.e. by absolute value) + // 3) negative integers (Major Type 1) sort after all positive integers + // (Major Type 0) + // Effectively, this means integers are sorted as sign+magnitude. + // 4) floating point types (Major Type 7) sort after all integers + QCborValue vundef, vnull(nullptr); QCborValue vtrue(true), vfalse(false); QCborValue vint1(1), vint2(2); @@ -1830,7 +1842,6 @@ void tst_QCborValue::sorting() CHECK_ORDER(vint1, vint2); CHECK_ORDER(vdouble1, vdouble2); CHECK_ORDER(vndouble1, vndouble2); - // note: shorter length sorts first CHECK_ORDER(vba1, vba2); CHECK_ORDER(vba2, vba3); CHECK_ORDER(vs1, vs2); @@ -1887,29 +1898,85 @@ void tst_QCborValue::sorting() str.prepend(char(QCborValue::String) + str.size()); return QCborValue::fromCbor(str); }; - // 5 code units in UTF-8 - QCborValue vs4_utf16(u"Mørk"_s); - QCborValue vs4_utf8 = utf8string("Mørk"); - QCOMPARE(vs4_utf8, vs4_utf8); - QCOMPARE(vs4_utf16, vs4_utf16); - QCOMPARE(vs4_utf16, vs4_utf8); - // 5 code units in UTF-16 - QCborValue vs5_utf16(u"Først"_s); - QCborValue vs5_utf8 = utf8string("Først"); - QCOMPARE(vs5_utf8, vs5_utf8); - QCOMPARE(vs5_utf16, vs5_utf16); - QCOMPARE(vs5_utf16, vs5_utf8); + auto addStringCmp = [&](const char *, const char *, QUtf8StringView lhs, + QUtf8StringView rhs) { + // CBOR orders strings by UTF-8 length + bool is_lt = (lhs.size() < rhs.size()); + if (!is_lt) + is_lt = (lhs < rhs); - // sorted by UTF-8 length first, so "Mørk" < "World" < "Først" (!!) - CHECK_ORDER(vs4_utf8, vs3); - CHECK_ORDER(vs4_utf16, vs3); - CHECK_ORDER(vs3, vs5_utf8); - CHECK_ORDER(vs3, vs5_utf16); - CHECK_ORDER(vs4_utf8, vs5_utf8); - CHECK_ORDER(vs4_utf8, vs5_utf16); - CHECK_ORDER(vs4_utf16, vs5_utf8); - CHECK_ORDER(vs4_utf16, vs5_utf16); + QCborValue lhs_utf8 = utf8string(QByteArrayView(lhs).toByteArray()); + QCborValue rhs_utf8 = utf8string(QByteArrayView(rhs).toByteArray()); + QCborValue lhs_utf16 = QString::fromUtf8(lhs); + QCborValue rhs_utf16 = QString::fromUtf8(rhs); + + if (is_lt) { + CHECK_ORDER(lhs_utf8, rhs_utf8); + CHECK_ORDER(lhs_utf8, rhs_utf16); + CHECK_ORDER(lhs_utf16, rhs_utf8); + CHECK_ORDER(lhs_utf16, rhs_utf16); + } else { + QCOMPARE(lhs_utf8, rhs_utf8); + QCOMPARE(lhs_utf8, rhs_utf16); + QCOMPARE(lhs_utf16, rhs_utf8); + QCOMPARE(lhs_utf16, rhs_utf16); + } + }; + auto addStringCmpSameLength = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() == rhs.size()); + addStringCmp("samelength-", tag, lhs, rhs); + }; + auto addStringCmpShorter = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() < rhs.size()); + addStringCmp("shorter-", tag, lhs, rhs); + }; + + // ascii-only is already tested + addStringCmp("equal-", "1continuation", "ab\u00A0c", "ab\u00A0c"); + addStringCmp("equal-", "2continuation", "ab\u0800", "ab\u0800"); + addStringCmp("equal-", "3continuation", "a\U00010000", "a\U00010000"); + + // these strings all have the same UTF-8 length (5 bytes) + addStringCmpSameLength("less-ascii", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-1continuation", "ab\u00A0c", "ab\u07FFc"); + addStringCmpSameLength("less-2continuation", "ab\u0800", "ab\uFFFC"); + addStringCmpSameLength("less-3continuation", "a\U00010000", "a\U0010FFFC"); + addStringCmpSameLength("less-0-vs-1continuation", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-0-vs-2continuation", "abcde", "ab\u0800"); + addStringCmpSameLength("less-0-vs-3continuation", "abcde", "a\U00010000"); + addStringCmpSameLength("less-1-vs-2continuation", "ab\u00A0c", "ab\uFFFC"); + addStringCmpSameLength("less-1-vs-3continuation", "ab\u00A0c", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation", "ab\u0800", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation_surrogate", "a\uFFFCz", "a\U00010000"); // even though U+D800 < U+FFFC + + // these strings have different lengths in UTF-8 + // (0continuation already tested) + addStringCmpShorter("1continuation", "ab\u00A0", "ab\u00A0c"); + addStringCmpShorter("2continuation", "ab\u0800", "ab\u0800c"); + addStringCmpShorter("3continuation", "ab\U00010000", "ab\U00010000c"); + // most of these have the same length in UTF-16! + addStringCmpShorter("0-vs-1continuation", "abc", "ab\u00A0"); + addStringCmpShorter("0-vs-2continuation", "abcd", "ab\u0800"); + addStringCmpShorter("0-vs-3continuation", "abcde", "ab\U00010000"); + addStringCmpShorter("1-vs-2continuation", "ab\u00A0", "ab\u0800"); + addStringCmpShorter("1-vs-3continuation", "abc\u00A0", "ab\U00010000"); + addStringCmpShorter("2-vs-3continuation", "ab\u0800", "ab\U00010000"); + + // lhs is 4xUTF-16 and 8xUTF-8; rhs is 3xUTF-16 but 9xUTF-8 + addStringCmpShorter("3x2-vs-2x3continuation", "\U00010000\U00010000", "\u0800\u0800\u0800"); + + // slight surprising because normally rhs would sort first ("aa" vs "ab" prefix) + // (0continuation_surprise already tested) + addStringCmpShorter("1continuation_surprise", "ab\u00A0", "aa\u00A0c"); + addStringCmpShorter("2continuation_surprise", "ab\u0800", "aa\u0800c"); + addStringCmpShorter("3continuation_surprise", "ab\U00010000", "aa\U00010000c"); + addStringCmpShorter("0-vs-1continuation_surprise", "abc", "aa\u00A0"); + addStringCmpShorter("0-vs-2continuation_surprise", "abcd", "aa\u0800"); + addStringCmpShorter("0-vs-3continuation_surprise", "abcde", "aa\U00010000"); + addStringCmpShorter("1-vs-2continuation_surprise", "ab\u00A0", "aa\u0800"); + addStringCmpShorter("1-vs-3continuation_surprise", "abc\u00A0", "aa\U00010000"); + addStringCmpShorter("2-vs-3continuation_surprise", "ab\u0800", "aa\U00010000"); #undef CHECK_ORDER }