diff --git a/src/corelib/serialization/qcborvalue.cpp b/src/corelib/serialization/qcborvalue.cpp index 81eb13bd54f..3414cff5fc3 100644 --- a/src/corelib/serialization/qcborvalue.cpp +++ b/src/corelib/serialization/qcborvalue.cpp @@ -1094,6 +1094,71 @@ QCborValue QCborContainerPrivate::extractAt_complex(Element e) return makeValue(e.type, 0, container); } +// Similar to QStringIterator::next() but returns malformed surrogate pair +// itself when one is detected, and returns the length in UTF-8. +static auto nextUtf32Character(const char16_t *&ptr, const char16_t *end) noexcept +{ + Q_ASSERT(ptr != end); + struct R { + char32_t c; + qsizetype len = 1; // in UTF-8 code units (bytes) + } r = { *ptr++ }; + + if (r.c < 0x0800) { + if (r.c >= 0x0080) + ++r.len; + } else if (!QChar::isHighSurrogate(r.c) || ptr == end) { + r.len += 2; + } else { + r.len += 3; + r.c = QChar::surrogateToUcs4(r.c, *ptr++); + } + + return r; +} + +static qsizetype stringLengthInUtf8(const char16_t *ptr, const char16_t *end) noexcept +{ + qsizetype len = 0; + while (ptr < end) + len += nextUtf32Character(ptr, end).len; + return len; +} + +static int compareStringsInUtf8(QStringView lhs, QStringView rhs, Comparison mode) noexcept +{ + if (mode == Comparison::ForEquality) + return lhs == rhs ? 0 : 1; + + // The UTF-16 length is *usually* comparable, but not always. There are + // pathological cases where they can be wrong, so we need to compare as if + // we were doing it in UTF-8. That includes the case of UTF-16 surrogate + // pairs, because qstring.cpp sorts them before U+E000-U+FFFF. + int diff = 0; + qsizetype len1 = 0; + qsizetype len2 = 0; + const char16_t *src1 = lhs.utf16(); + const char16_t *src2 = rhs.utf16(); + const char16_t *end1 = src1 + lhs.size(); + const char16_t *end2 = src2 + rhs.size(); + + // first, scan until we find a difference (if any) + do { + auto r1 = nextUtf32Character(src1, end1); + auto r2 = nextUtf32Character(src2, end2); + len1 += r1.len; + len2 += r2.len; + diff = int(r1.c) - int(r2.c); // no underflow due to limited range + } while (src1 < end1 && src2 < end2 && diff == 0); + + // compute the full length past this first difference + len1 += stringLengthInUtf8(src1, end1); + len2 += stringLengthInUtf8(src2, end2); + if (len1 == len2) + return diff; + return len1 < len2 ? -1 : 1; +} + QT_WARNING_DISABLE_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned static int compareContainer(const QCborContainerPrivate *c1, const QCborContainerPrivate *c2, Comparison mode); @@ -1187,11 +1252,8 @@ static int compareElementRecursive(const QCborContainerPrivate *c1, const Elemen // 5) UTF-8 and US-ASCII // 6) US-ASCII and US-ASCII if ((e1.flags & Element::StringIsUtf16) && (e2.flags & Element::StringIsUtf16)) { - // Case 1: both UTF-16, so lengths are comparable. - // (we can't use memcmp in little-endian machines) - if (len1 == len2) - return compareStrings(b1->asStringView(), b2->asStringView()); - return len1 < len2 ? -1 : 1; + // Case 1: both UTF-16 + return compareStringsInUtf8(b1->asStringView(), b2->asStringView(), mode); } if (!(e1.flags & Element::StringIsUtf16) && !(e2.flags & Element::StringIsUtf16)) { diff --git a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp index 45c3799c190..23b25834b9a 100644 --- a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp +++ b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp @@ -1870,6 +1870,18 @@ void tst_QCborValue::mapNested() void tst_QCborValue::sorting_data() { + // CBOR data comparisons are done as if we were comparing their canonically + // (deterministic) encoded forms in the byte stream, including the Major + // Type. That has a few surprises noted below: + // 1) because the length of a string precedes it, effectively strings are + // sorted by their UTF-8 length before their contents + // 2) because negative integers are stored in negated form, they sort in + // descending order (i.e. by absolute value) + // 3) negative integers (Major Type 1) sort after all positive integers + // (Major Type 0) + // Effectively, this means integers are sorted as sign+magnitude. + // 4) floating point types (Major Type 7) sort after all integers + QTest::addColumn("lhs"); QTest::addColumn("rhs"); QTest::addColumn("expectedOrdering"); @@ -1954,11 +1966,6 @@ void tst_QCborValue::sorting_data() addRow(vdouble1, vdouble2, Qt::strong_ordering::less); addRow(vdouble2, vdouble3, Qt::strong_ordering::less); // surprise: NaNs do compare addRow(vndouble1, vndouble2, Qt::strong_ordering::less); // surprise: -1.5 < -inf - // note: shorter length sorts first - addRow(vba1, vba2, Qt::strong_ordering::less); - addRow(vba2, vba3, Qt::strong_ordering::less); - addRow(vs1, vs2, Qt::strong_ordering::less); - addRow(vs2, vs3, Qt::strong_ordering::less); addRow(va1, va2, Qt::strong_ordering::less); addRow(va2, va3, Qt::strong_ordering::less); addRow(vm1, vm2, Qt::strong_ordering::less); @@ -1971,10 +1978,16 @@ void tst_QCborValue::sorting_data() addRow(vurl1, vurl2, Qt::strong_ordering::less); addRow(vuuid1, vuuid2, Qt::strong_ordering::less); - // surprise 1: CBOR sorts integrals by absolute value + // surprise 1: CBOR sorts strings by length first + addRow(vba1, vba2, Qt::strong_ordering::less); + addRow(vba2, vba3, Qt::strong_ordering::less); + addRow(vs1, vs2, Qt::strong_ordering::less); + addRow(vs2, vs3, Qt::strong_ordering::less); + + // surprise 2: CBOR sorts integrals by absolute value addRow(vneg1, vneg2, Qt::strong_ordering::less); - // surprise 2: CBOR sorts negatives after positives (sign+magnitude) + // surprise 3: CBOR sorts negatives after positives (sign+magnitude) addRow(vint2, vneg1, Qt::strong_ordering::less); addRow(vdouble2, vndouble1, Qt::strong_ordering::less); @@ -2008,29 +2021,79 @@ void tst_QCborValue::sorting_data() str.prepend(char(QCborValue::String) + str.size()); return QCborValue::fromCbor(str); }; - // 5 code units in UTF-8 - QCborValue vs4_utf16(u"Mørk"_s); - QCborValue vs4_utf8 = utf8string("Mørk"); - QTest::newRow("self-utf8_5b") << vs4_utf8 << vs4_utf8 << Qt::strong_ordering::equal; - QTest::newRow("self-utf16_5b") << vs4_utf16 << vs4_utf16 << Qt::strong_ordering::equal; - QTest::newRow("utf16_5b-cmp-utf8_5b") << vs4_utf16 << vs4_utf8 << Qt::strong_ordering::equal; - // 5 code units in UTF-16 - QCborValue vs5_utf16(u"Først"_s); - QCborValue vs5_utf8 = utf8string("Først"); - QTest::newRow("self-utf8_5c") << vs5_utf8 << vs5_utf8 << Qt::strong_ordering::equal; - QTest::newRow("self-utf16_5c") << vs5_utf16 << vs5_utf16 << Qt::strong_ordering::equal; - QTest::newRow("utf16_5c-cmp-utf8_5c") << vs5_utf16 << vs5_utf8 << Qt::strong_ordering::equal; + auto addStringCmp = [&](const char *prefix, const char *tag, QUtf8StringView lhs, + QUtf8StringView rhs) { + // CBOR orders strings by UTF-8 length + auto order = Qt::compareThreeWay(lhs.size(), rhs.size()); + if (is_eq(order)) + order = compareThreeWay(lhs, rhs); + Q_ASSERT(is_eq(order) || is_lt(order)); // please keep lhs <= rhs! - // sorted by UTF-8 length first, so "Mørk" < "World" < "Først" (!!) - QTest::newRow("ascii-cmp-utf8_5b") << vs3 << vs4_utf8 << Qt::strong_ordering::greater; - QTest::newRow("ascii-cmp-utf16_5b") << vs3 << vs4_utf16 << Qt::strong_ordering::greater; - QTest::newRow("ascii-cmp-utf8_5c") << vs3 << vs5_utf8 << Qt::strong_ordering::less; - QTest::newRow("ascii-cmp-utf16_5c") << vs3 << vs5_utf16 << Qt::strong_ordering::less; - QTest::newRow("utf8_5b-cmp-utf8_5c") << vs4_utf8 << vs5_utf8 << Qt::strong_ordering::less; - QTest::newRow("utf8_5b-cmp-utf16_5c") << vs4_utf8 << vs5_utf16 << Qt::strong_ordering::less; - QTest::newRow("utf16_5b-cmp-utf8_5c") << vs4_utf16 << vs5_utf8 << Qt::strong_ordering::less; - QTest::newRow("utf16_5b-cmp-utf16_5c") << vs4_utf16 << vs5_utf16 << Qt::strong_ordering::less; + QCborValue lhs_utf8 = utf8string(QByteArrayView(lhs).toByteArray()); + QCborValue rhs_utf8 = utf8string(QByteArrayView(rhs).toByteArray()); + QCborValue lhs_utf16 = QString::fromUtf8(lhs); + QCborValue rhs_utf16 = QString::fromUtf8(rhs); + + QTest::addRow("string-%s%s:utf8-utf8", prefix, tag) << lhs_utf8 << rhs_utf8 << order; + QTest::addRow("string-%s%s:utf8-utf16", prefix, tag) << lhs_utf8 << rhs_utf16 << order; + QTest::addRow("string-%s%s:utf16-utf8", prefix, tag) << lhs_utf16 << rhs_utf8 << order; + QTest::addRow("string-%s%s:utf16-utf16", prefix, tag) << lhs_utf16 << rhs_utf16 << order; + }; + auto addStringCmpSameLength = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() == rhs.size()); + addStringCmp("samelength-", tag, lhs, rhs); + }; + auto addStringCmpShorter = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() < rhs.size()); + addStringCmp("shorter-", tag, lhs, rhs); + }; + + // ascii-only is already tested + addStringCmp("equal-", "1continuation", "ab\u00A0c", "ab\u00A0c"); + addStringCmp("equal-", "2continuation", "ab\u0800", "ab\u0800"); + addStringCmp("equal-", "3continuation", "a\U00010000", "a\U00010000"); + + // these strings all have the same UTF-8 length (5 bytes) + addStringCmpSameLength("less-ascii", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-1continuation", "ab\u00A0c", "ab\u07FFc"); + addStringCmpSameLength("less-2continuation", "ab\u0800", "ab\uFFFC"); + addStringCmpSameLength("less-3continuation", "a\U00010000", "a\U0010FFFC"); + addStringCmpSameLength("less-0-vs-1continuation", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-0-vs-2continuation", "abcde", "ab\u0800"); + addStringCmpSameLength("less-0-vs-3continuation", "abcde", "a\U00010000"); + addStringCmpSameLength("less-1-vs-2continuation", "ab\u00A0c", "ab\uFFFC"); + addStringCmpSameLength("less-1-vs-3continuation", "ab\u00A0c", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation", "ab\u0800", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation_surrogate", "a\uFFFCz", "a\U00010000"); // even though U+D800 < U+FFFC + + // these strings have different lengths in UTF-8 + // (0continuation already tested) + addStringCmpShorter("1continuation", "ab\u00A0", "ab\u00A0c"); + addStringCmpShorter("2continuation", "ab\u0800", "ab\u0800c"); + addStringCmpShorter("3continuation", "ab\U00010000", "ab\U00010000c"); + // most of these have the same length in UTF-16! + addStringCmpShorter("0-vs-1continuation", "abc", "ab\u00A0"); + addStringCmpShorter("0-vs-2continuation", "abcd", "ab\u0800"); + addStringCmpShorter("0-vs-3continuation", "abcde", "ab\U00010000"); + addStringCmpShorter("1-vs-2continuation", "ab\u00A0", "ab\u0800"); + addStringCmpShorter("1-vs-3continuation", "abc\u00A0", "ab\U00010000"); + addStringCmpShorter("2-vs-3continuation", "ab\u0800", "ab\U00010000"); + + // lhs is 4xUTF-16 and 8xUTF-8; rhs is 3xUTF-16 but 9xUTF-8 + addStringCmpShorter("3x2-vs-2x3continuation", "\U00010000\U00010000", "\u0800\u0800\u0800"); + + // slight surprising because normally rhs would sort first ("aa" vs "ab" prefix) + // (0continuation_surprise already tested) + addStringCmpShorter("1continuation_surprise", "ab\u00A0", "aa\u00A0c"); + addStringCmpShorter("2continuation_surprise", "ab\u0800", "aa\u0800c"); + addStringCmpShorter("3continuation_surprise", "ab\U00010000", "aa\U00010000c"); + addStringCmpShorter("0-vs-1continuation_surprise", "abc", "aa\u00A0"); + addStringCmpShorter("0-vs-2continuation_surprise", "abcd", "aa\u0800"); + addStringCmpShorter("0-vs-3continuation_surprise", "abcde", "aa\U00010000"); + addStringCmpShorter("1-vs-2continuation_surprise", "ab\u00A0", "aa\u0800"); + addStringCmpShorter("1-vs-3continuation_surprise", "abc\u00A0", "aa\U00010000"); + addStringCmpShorter("2-vs-3continuation_surprise", "ab\u0800", "aa\U00010000"); } void tst_QCborValue::sorting()