CBOR: fix sorting of UTF16-to-UTF16 strings
This amends commit 394788c68efacdec2676988b4b4ff207b20557f2 (its ChangeLog applies to this commit too). That fixed sorting of UTF8-to- UTF16, but when adding more unit tests, I've discovered that some UTF-16 strings also sorted incorrectly. There were two problems: First, we were assuming that we could rely on the UTF-16 length as a proxy for the UTF-8 one, but that's not true for some cases: * both 1-, 2- and 3-codepoint UTF-8 sequences are 1 codepoint in UTF-16, so some strings would have identical UTF-16 length * 4-codepoint UTF-8 sequences shrink to 2-codepoint UTF-16 ones (2:1) but 3-codepoint UTF-8 sequences shrink to 1 (3:1), so some strings would be longer in UTF-16 but shorter in UTF-8. Second, QtPrivate::compareStrings performs UTF-16 codepoint comparisons not Unicode character ones, so surrogate pairs were sorting before U+E000 to U+FFFF. To fix all of this, we need to decode the UTF-16 string into UTF-32 and calculate the length of that in UTF-8 to be sure we have the sorting order right. Since this is a slight behavior change with a performance penalty, I am choosing to backport only to 6.7. The penalty mostly does not apply to 6.8 due to commit 61556627f25e7c7acbfcc5e54127a392b5239977. Change-Id: If1bf59ecbe014b569ba1fffd17c4c4ddcc874aac Reviewed-by: Ivan Solovev <ivan.solovev@qt.io> (cherry picked from commit d4c7da9a07dc1434692fe08a61ba22c794574c4f) Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
82b2436d4c
commit
8764a0c79d
@ -1087,6 +1087,68 @@ QCborValue QCborContainerPrivate::extractAt_complex(Element e)
|
|||||||
return makeValue(e.type, 0, container);
|
return makeValue(e.type, 0, container);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Similar to QStringIterator::next() but returns malformed surrogate pair
|
||||||
|
// itself when one is detected, and returns the length in UTF-8.
|
||||||
|
static auto nextUtf32Character(const char16_t *&ptr, const char16_t *end) noexcept
|
||||||
|
{
|
||||||
|
Q_ASSERT(ptr != end);
|
||||||
|
struct R {
|
||||||
|
char32_t c;
|
||||||
|
qsizetype len = 1; // in UTF-8 code units (bytes)
|
||||||
|
} r = { *ptr++ };
|
||||||
|
|
||||||
|
if (r.c < 0x0800) {
|
||||||
|
if (r.c >= 0x0080)
|
||||||
|
++r.len;
|
||||||
|
} else if (!QChar::isHighSurrogate(r.c) || ptr == end) {
|
||||||
|
r.len += 2;
|
||||||
|
} else {
|
||||||
|
r.len += 3;
|
||||||
|
r.c = QChar::surrogateToUcs4(r.c, *ptr++);
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static qsizetype stringLengthInUtf8(const char16_t *ptr, const char16_t *end) noexcept
|
||||||
|
{
|
||||||
|
qsizetype len = 0;
|
||||||
|
while (ptr < end)
|
||||||
|
len += nextUtf32Character(ptr, end).len;
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int compareStringsInUtf8(QStringView lhs, QStringView rhs) noexcept
|
||||||
|
{
|
||||||
|
// The UTF-16 length is *usually* comparable, but not always. There are
|
||||||
|
// pathological cases where they can be wrong, so we need to compare as if
|
||||||
|
// we were doing it in UTF-8. That includes the case of UTF-16 surrogate
|
||||||
|
// pairs, because qstring.cpp sorts them before U+E000-U+FFFF.
|
||||||
|
int diff = 0;
|
||||||
|
qsizetype len1 = 0;
|
||||||
|
qsizetype len2 = 0;
|
||||||
|
const char16_t *src1 = lhs.utf16();
|
||||||
|
const char16_t *src2 = rhs.utf16();
|
||||||
|
const char16_t *end1 = src1 + lhs.size();
|
||||||
|
const char16_t *end2 = src2 + rhs.size();
|
||||||
|
|
||||||
|
// first, scan until we find a difference (if any)
|
||||||
|
do {
|
||||||
|
auto r1 = nextUtf32Character(src1, end1);
|
||||||
|
auto r2 = nextUtf32Character(src2, end2);
|
||||||
|
len1 += r1.len;
|
||||||
|
len2 += r2.len;
|
||||||
|
diff = int(r1.c) - int(r2.c); // no underflow due to limited range
|
||||||
|
} while (src1 < end1 && src2 < end2 && diff == 0);
|
||||||
|
|
||||||
|
// compute the full length past this first difference
|
||||||
|
len1 += stringLengthInUtf8(src1, end1);
|
||||||
|
len2 += stringLengthInUtf8(src2, end2);
|
||||||
|
if (len1 == len2)
|
||||||
|
return diff;
|
||||||
|
return len1 < len2 ? -1 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
QT_WARNING_DISABLE_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned
|
QT_WARNING_DISABLE_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned
|
||||||
static int compareContainer(const QCborContainerPrivate *c1, const QCborContainerPrivate *c2);
|
static int compareContainer(const QCborContainerPrivate *c1, const QCborContainerPrivate *c2);
|
||||||
static int compareElementNoData(const Element &e1, const Element &e2)
|
static int compareElementNoData(const Element &e1, const Element &e2)
|
||||||
@ -1167,11 +1229,8 @@ static int compareElementRecursive(const QCborContainerPrivate *c1, const Elemen
|
|||||||
// 5) UTF-8 and US-ASCII
|
// 5) UTF-8 and US-ASCII
|
||||||
// 6) US-ASCII and US-ASCII
|
// 6) US-ASCII and US-ASCII
|
||||||
if ((e1.flags & Element::StringIsUtf16) && (e2.flags & Element::StringIsUtf16)) {
|
if ((e1.flags & Element::StringIsUtf16) && (e2.flags & Element::StringIsUtf16)) {
|
||||||
// Case 1: both UTF-16, so lengths are comparable.
|
// Case 1: both UTF-16
|
||||||
// (we can't use memcmp in little-endian machines)
|
return compareStringsInUtf8(b1->asStringView(), b2->asStringView());
|
||||||
if (len1 == len2)
|
|
||||||
return QtPrivate::compareStrings(b1->asStringView(), b2->asStringView());
|
|
||||||
return len1 < len2 ? -1 : 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(e1.flags & Element::StringIsUtf16) && !(e2.flags & Element::StringIsUtf16)) {
|
if (!(e1.flags & Element::StringIsUtf16) && !(e2.flags & Element::StringIsUtf16)) {
|
||||||
|
@ -1798,6 +1798,18 @@ void tst_QCborValue::mapNested()
|
|||||||
|
|
||||||
void tst_QCborValue::sorting()
|
void tst_QCborValue::sorting()
|
||||||
{
|
{
|
||||||
|
// CBOR data comparisons are done as if we were comparing their canonically
|
||||||
|
// (deterministic) encoded forms in the byte stream, including the Major
|
||||||
|
// Type. That has a few surprises noted below:
|
||||||
|
// 1) because the length of a string precedes it, effectively strings are
|
||||||
|
// sorted by their UTF-8 length before their contents
|
||||||
|
// 2) because negative integers are stored in negated form, they sort in
|
||||||
|
// descending order (i.e. by absolute value)
|
||||||
|
// 3) negative integers (Major Type 1) sort after all positive integers
|
||||||
|
// (Major Type 0)
|
||||||
|
// Effectively, this means integers are sorted as sign+magnitude.
|
||||||
|
// 4) floating point types (Major Type 7) sort after all integers
|
||||||
|
|
||||||
QCborValue vundef, vnull(nullptr);
|
QCborValue vundef, vnull(nullptr);
|
||||||
QCborValue vtrue(true), vfalse(false);
|
QCborValue vtrue(true), vfalse(false);
|
||||||
QCborValue vint1(1), vint2(2);
|
QCborValue vint1(1), vint2(2);
|
||||||
@ -1830,7 +1842,6 @@ void tst_QCborValue::sorting()
|
|||||||
CHECK_ORDER(vint1, vint2);
|
CHECK_ORDER(vint1, vint2);
|
||||||
CHECK_ORDER(vdouble1, vdouble2);
|
CHECK_ORDER(vdouble1, vdouble2);
|
||||||
CHECK_ORDER(vndouble1, vndouble2);
|
CHECK_ORDER(vndouble1, vndouble2);
|
||||||
// note: shorter length sorts first
|
|
||||||
CHECK_ORDER(vba1, vba2);
|
CHECK_ORDER(vba1, vba2);
|
||||||
CHECK_ORDER(vba2, vba3);
|
CHECK_ORDER(vba2, vba3);
|
||||||
CHECK_ORDER(vs1, vs2);
|
CHECK_ORDER(vs1, vs2);
|
||||||
@ -1887,29 +1898,85 @@ void tst_QCborValue::sorting()
|
|||||||
str.prepend(char(QCborValue::String) + str.size());
|
str.prepend(char(QCborValue::String) + str.size());
|
||||||
return QCborValue::fromCbor(str);
|
return QCborValue::fromCbor(str);
|
||||||
};
|
};
|
||||||
// 5 code units in UTF-8
|
|
||||||
QCborValue vs4_utf16(u"Mørk"_s);
|
|
||||||
QCborValue vs4_utf8 = utf8string("Mørk");
|
|
||||||
QCOMPARE(vs4_utf8, vs4_utf8);
|
|
||||||
QCOMPARE(vs4_utf16, vs4_utf16);
|
|
||||||
QCOMPARE(vs4_utf16, vs4_utf8);
|
|
||||||
|
|
||||||
// 5 code units in UTF-16
|
auto addStringCmp = [&](const char *, const char *, QUtf8StringView lhs,
|
||||||
QCborValue vs5_utf16(u"Først"_s);
|
QUtf8StringView rhs) {
|
||||||
QCborValue vs5_utf8 = utf8string("Først");
|
// CBOR orders strings by UTF-8 length
|
||||||
QCOMPARE(vs5_utf8, vs5_utf8);
|
bool is_lt = (lhs.size() < rhs.size());
|
||||||
QCOMPARE(vs5_utf16, vs5_utf16);
|
if (!is_lt)
|
||||||
QCOMPARE(vs5_utf16, vs5_utf8);
|
is_lt = (lhs < rhs);
|
||||||
|
|
||||||
// sorted by UTF-8 length first, so "Mørk" < "World" < "Først" (!!)
|
QCborValue lhs_utf8 = utf8string(QByteArrayView(lhs).toByteArray());
|
||||||
CHECK_ORDER(vs4_utf8, vs3);
|
QCborValue rhs_utf8 = utf8string(QByteArrayView(rhs).toByteArray());
|
||||||
CHECK_ORDER(vs4_utf16, vs3);
|
QCborValue lhs_utf16 = QString::fromUtf8(lhs);
|
||||||
CHECK_ORDER(vs3, vs5_utf8);
|
QCborValue rhs_utf16 = QString::fromUtf8(rhs);
|
||||||
CHECK_ORDER(vs3, vs5_utf16);
|
|
||||||
CHECK_ORDER(vs4_utf8, vs5_utf8);
|
if (is_lt) {
|
||||||
CHECK_ORDER(vs4_utf8, vs5_utf16);
|
CHECK_ORDER(lhs_utf8, rhs_utf8);
|
||||||
CHECK_ORDER(vs4_utf16, vs5_utf8);
|
CHECK_ORDER(lhs_utf8, rhs_utf16);
|
||||||
CHECK_ORDER(vs4_utf16, vs5_utf16);
|
CHECK_ORDER(lhs_utf16, rhs_utf8);
|
||||||
|
CHECK_ORDER(lhs_utf16, rhs_utf16);
|
||||||
|
} else {
|
||||||
|
QCOMPARE(lhs_utf8, rhs_utf8);
|
||||||
|
QCOMPARE(lhs_utf8, rhs_utf16);
|
||||||
|
QCOMPARE(lhs_utf16, rhs_utf8);
|
||||||
|
QCOMPARE(lhs_utf16, rhs_utf16);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
auto addStringCmpSameLength = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) {
|
||||||
|
Q_ASSERT(lhs.size() == rhs.size());
|
||||||
|
addStringCmp("samelength-", tag, lhs, rhs);
|
||||||
|
};
|
||||||
|
auto addStringCmpShorter = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) {
|
||||||
|
Q_ASSERT(lhs.size() < rhs.size());
|
||||||
|
addStringCmp("shorter-", tag, lhs, rhs);
|
||||||
|
};
|
||||||
|
|
||||||
|
// ascii-only is already tested
|
||||||
|
addStringCmp("equal-", "1continuation", "ab\u00A0c", "ab\u00A0c");
|
||||||
|
addStringCmp("equal-", "2continuation", "ab\u0800", "ab\u0800");
|
||||||
|
addStringCmp("equal-", "3continuation", "a\U00010000", "a\U00010000");
|
||||||
|
|
||||||
|
// these strings all have the same UTF-8 length (5 bytes)
|
||||||
|
addStringCmpSameLength("less-ascii", "abcde", "ab\u00A0c");
|
||||||
|
addStringCmpSameLength("less-1continuation", "ab\u00A0c", "ab\u07FFc");
|
||||||
|
addStringCmpSameLength("less-2continuation", "ab\u0800", "ab\uFFFC");
|
||||||
|
addStringCmpSameLength("less-3continuation", "a\U00010000", "a\U0010FFFC");
|
||||||
|
addStringCmpSameLength("less-0-vs-1continuation", "abcde", "ab\u00A0c");
|
||||||
|
addStringCmpSameLength("less-0-vs-2continuation", "abcde", "ab\u0800");
|
||||||
|
addStringCmpSameLength("less-0-vs-3continuation", "abcde", "a\U00010000");
|
||||||
|
addStringCmpSameLength("less-1-vs-2continuation", "ab\u00A0c", "ab\uFFFC");
|
||||||
|
addStringCmpSameLength("less-1-vs-3continuation", "ab\u00A0c", "a\U00010000");
|
||||||
|
addStringCmpSameLength("less-2-vs-3continuation", "ab\u0800", "a\U00010000");
|
||||||
|
addStringCmpSameLength("less-2-vs-3continuation_surrogate", "a\uFFFCz", "a\U00010000"); // even though U+D800 < U+FFFC
|
||||||
|
|
||||||
|
// these strings have different lengths in UTF-8
|
||||||
|
// (0continuation already tested)
|
||||||
|
addStringCmpShorter("1continuation", "ab\u00A0", "ab\u00A0c");
|
||||||
|
addStringCmpShorter("2continuation", "ab\u0800", "ab\u0800c");
|
||||||
|
addStringCmpShorter("3continuation", "ab\U00010000", "ab\U00010000c");
|
||||||
|
// most of these have the same length in UTF-16!
|
||||||
|
addStringCmpShorter("0-vs-1continuation", "abc", "ab\u00A0");
|
||||||
|
addStringCmpShorter("0-vs-2continuation", "abcd", "ab\u0800");
|
||||||
|
addStringCmpShorter("0-vs-3continuation", "abcde", "ab\U00010000");
|
||||||
|
addStringCmpShorter("1-vs-2continuation", "ab\u00A0", "ab\u0800");
|
||||||
|
addStringCmpShorter("1-vs-3continuation", "abc\u00A0", "ab\U00010000");
|
||||||
|
addStringCmpShorter("2-vs-3continuation", "ab\u0800", "ab\U00010000");
|
||||||
|
|
||||||
|
// lhs is 4xUTF-16 and 8xUTF-8; rhs is 3xUTF-16 but 9xUTF-8
|
||||||
|
addStringCmpShorter("3x2-vs-2x3continuation", "\U00010000\U00010000", "\u0800\u0800\u0800");
|
||||||
|
|
||||||
|
// slight surprising because normally rhs would sort first ("aa" vs "ab" prefix)
|
||||||
|
// (0continuation_surprise already tested)
|
||||||
|
addStringCmpShorter("1continuation_surprise", "ab\u00A0", "aa\u00A0c");
|
||||||
|
addStringCmpShorter("2continuation_surprise", "ab\u0800", "aa\u0800c");
|
||||||
|
addStringCmpShorter("3continuation_surprise", "ab\U00010000", "aa\U00010000c");
|
||||||
|
addStringCmpShorter("0-vs-1continuation_surprise", "abc", "aa\u00A0");
|
||||||
|
addStringCmpShorter("0-vs-2continuation_surprise", "abcd", "aa\u0800");
|
||||||
|
addStringCmpShorter("0-vs-3continuation_surprise", "abcde", "aa\U00010000");
|
||||||
|
addStringCmpShorter("1-vs-2continuation_surprise", "ab\u00A0", "aa\u0800");
|
||||||
|
addStringCmpShorter("1-vs-3continuation_surprise", "abc\u00A0", "aa\U00010000");
|
||||||
|
addStringCmpShorter("2-vs-3continuation_surprise", "ab\u0800", "aa\U00010000");
|
||||||
#undef CHECK_ORDER
|
#undef CHECK_ORDER
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user