Limit QByteArray's 8-bit support to ASCII

Previously it handled Latin-1, which made it incompatible with UTF-8,
which is now our preferred 8-bit encoding. For Qt6 it is limited to
ASCII. Adjusted tests to match. QLatin1String::compare() turned out
to be relying on qstrnicmp()'s Latin-1 handling.

Removed some spurious Q_UNLIKELY()s and tidied up code a little in the
process.

[ChangeLog][QtCore][Important Behavior Changes] Encoding-dependent
features of QByteArray are now limited to ASCII, where previously
they worked for the whole of Latin-1. This affects case-insensitive
comparison, notably including qstricmp() and qstrnicmp(), and
case-transforming functions.

Fixes: QTBUG-84323
Change-Id: I2925d9908f8654599195a2860847b17083911b41
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
This commit is contained in:
Edward Welbourne 2020-05-29 13:12:28 +02:00
parent 135204bdf6
commit 9dd8e655cd
4 changed files with 164 additions and 182 deletions

View File

@ -1,6 +1,6 @@
/**************************************************************************** /****************************************************************************
** **
** Copyright (C) 2019 The Qt Company Ltd. ** Copyright (C) 2020 The Qt Company Ltd.
** Copyright (C) 2016 Intel Corporation. ** Copyright (C) 2016 Intel Corporation.
** Copyright (C) 2019 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com> ** Copyright (C) 2019 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
** Contact: https://www.qt.io/licensing/ ** Contact: https://www.qt.io/licensing/
@ -69,64 +69,16 @@
QT_BEGIN_NAMESPACE QT_BEGIN_NAMESPACE
// Latin 1 case system, used by QByteArray::to{Upper,Lower}() and qstr(n)icmp(): // ASCII case system, used by QByteArray::to{Upper,Lower}() and qstr(n)icmp():
/* static constexpr inline uchar asciiUpper(uchar c)
#!/usr/bin/perl -l {
use feature "unicode_strings"; return c >= 'a' && c <= 'z' ? c & ~0x20 : c;
for (0..255) {
$up = uc(chr($_));
$up = chr($_) if ord($up) > 0x100 || length $up > 1;
printf "0x%02x,", ord($up);
print "" if ($_ & 0xf) == 0xf;
} }
*/
static const uchar latin1_uppercased[256] = {
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xf7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xff
};
/* static constexpr inline uchar asciiLower(uchar c)
#!/usr/bin/perl -l {
use feature "unicode_strings"; return c >= 'A' && c <= 'Z' ? c | 0x20 : c;
for (0..255) {
$up = lc(chr($_));
$up = chr($_) if ord($up) > 0x100 || length $up > 1;
printf "0x%02x,", ord($up);
print "" if ($_ & 0xf) == 0xf;
} }
*/
static const uchar latin1_lowercased[256] = {
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};
int qFindByteArray( int qFindByteArray(
const char *haystack0, int haystackLen, int from, const char *haystack0, int haystackLen, int from,
@ -293,8 +245,8 @@ int qstrcmp(const char *str1, const char *str2)
A safe \c stricmp() function. A safe \c stricmp() function.
Compares \a str1 and \a str2 ignoring the case of the Compares \a str1 and \a str2, ignoring differences in the case of any ASCII
characters. The encoding of the strings is assumed to be Latin-1. characters.
Returns a negative value if \a str1 is less than \a str2, 0 if \a Returns a negative value if \a str1 is less than \a str2, 0 if \a
str1 is equal to \a str2 or a positive value if \a str1 is greater str1 is equal to \a str2 or a positive value if \a str1 is greater
@ -323,11 +275,10 @@ int qstricmp(const char *str1, const char *str2)
auto innerCompare = [=, &offset](qptrdiff max, bool unlimited) { auto innerCompare = [=, &offset](qptrdiff max, bool unlimited) {
max += offset; max += offset;
do { do {
uchar c = latin1_lowercased[s1[offset]]; uchar c = s1[offset];
int res = c - latin1_lowercased[s2[offset]]; if (int res = asciiLower(c) - asciiLower(s2[offset]))
if (Q_UNLIKELY(res))
return res; return res;
if (Q_UNLIKELY(!c)) if (!c)
return 0; return 0;
++offset; ++offset;
} while (unlimited || offset < max); } while (unlimited || offset < max);
@ -385,9 +336,8 @@ int qstricmp(const char *str1, const char *str2)
A safe \c strnicmp() function. A safe \c strnicmp() function.
Compares at most \a len bytes of \a str1 and \a str2 ignoring the Compares at most \a len bytes of \a str1 and \a str2, ignoring differences
case of the characters. The encoding of the strings is assumed to in the case of any ASCII characters.
be Latin-1.
Returns a negative value if \a str1 is less than \a str2, 0 if \a str1 Returns a negative value if \a str1 is less than \a str2, 0 if \a str1
is equal to \a str2 or a positive value if \a str1 is greater than \a is equal to \a str2 or a positive value if \a str1 is greater than \a
@ -406,12 +356,11 @@ int qstrnicmp(const char *str1, const char *str2, uint len)
{ {
const uchar *s1 = reinterpret_cast<const uchar *>(str1); const uchar *s1 = reinterpret_cast<const uchar *>(str1);
const uchar *s2 = reinterpret_cast<const uchar *>(str2); const uchar *s2 = reinterpret_cast<const uchar *>(str2);
int res;
uchar c;
if (!s1 || !s2) if (!s1 || !s2)
return s1 ? 1 : (s2 ? -1 : 0); return s1 ? 1 : (s2 ? -1 : 0);
for (; len--; s1++, s2++) { for (; len--; ++s1, ++s2) {
if ((res = (c = latin1_lowercased[*s1]) - latin1_lowercased[*s2])) const uchar c = *s1;
if (int res = asciiLower(c) - asciiLower(*s2))
return res; return res;
if (!c) // strings are equal if (!c) // strings are equal
break; break;
@ -437,28 +386,23 @@ int qstrnicmp(const char *str1, qsizetype len1, const char *str2, qsizetype len2
if (!s2) if (!s2)
return len1 == 0 ? 0 : 1; return len1 == 0 ? 0 : 1;
int res;
uchar c;
if (len2 == -1) { if (len2 == -1) {
// null-terminated str2 // null-terminated str2
qsizetype i; qsizetype i;
for (i = 0; i < len1; ++i) { for (i = 0; i < len1; ++i) {
c = latin1_lowercased[s2[i]]; const uchar c = s2[i];
if (!c) if (!c)
return 1; return 1;
res = latin1_lowercased[s1[i]] - c; if (int res = asciiLower(s1[i]) - asciiLower(c))
if (res)
return res; return res;
} }
c = latin1_lowercased[s2[i]]; return s2[i] ? -1 : 0;
return c ? -1 : 0;
} else { } else {
// not null-terminated // not null-terminated
for (qsizetype i = 0; i < qMin(len1, len2); ++i) { const qsizetype len = qMin(len1, len2);
c = latin1_lowercased[s2[i]]; for (qsizetype i = 0; i < len; ++i) {
res = latin1_lowercased[s1[i]] - c; if (int res = asciiLower(s1[i]) - asciiLower(s2[i]))
if (res)
return res; return res;
} }
if (len1 == len2) if (len1 == len2)
@ -786,14 +730,14 @@ QByteArray qUncompress(const uchar* data, int nbytes)
terminator, and uses \l{implicit sharing} (copy-on-write) to terminator, and uses \l{implicit sharing} (copy-on-write) to
reduce memory usage and avoid needless copying of data. reduce memory usage and avoid needless copying of data.
In addition to QByteArray, Qt also provides the QString class to In addition to QByteArray, Qt also provides the QString class to store
store string data. For most purposes, QString is the class you string data. For most purposes, QString is the class you want to use. It
want to use. It stores 16-bit Unicode characters, making it easy understands its content as Unicode text (encoded using UTF-16) where
to store non-ASCII/non-Latin-1 characters in your application. QByteArray aims to avoid assumptions about the encoding or semantics of the
Furthermore, QString is used throughout in the Qt API. The two bytes it stores (aside from a few legacy cases where it uses ASCII).
main cases where QByteArray is appropriate are when you need to Furthermore, QString is used throughout in the Qt API. The two main cases
store raw binary data, and when memory conservation is critical where QByteArray is appropriate are when you need to store raw binary data,
(e.g., with Qt for Embedded Linux). and when memory conservation is critical (e.g., with Qt for Embedded Linux).
One way to initialize a QByteArray is simply to pass a \c{const One way to initialize a QByteArray is simply to pass a \c{const
char *} to its constructor. For example, the following code char *} to its constructor. For example, the following code
@ -868,13 +812,6 @@ QByteArray qUncompress(const uchar* data, int nbytes)
memory QByteArray actually allocated. Data appended to an empty memory QByteArray actually allocated. Data appended to an empty
array is not copied. array is not copied.
A frequent requirement is to remove whitespace characters from a
byte array ('\\n', '\\t', ' ', etc.). If you want to remove
whitespace from both ends of a QByteArray, use trimmed(). If you
want to remove whitespace from both ends and replace multiple
consecutive whitespaces with a single space character within the
byte array, use simplified().
If you want to find all occurrences of a particular character or If you want to find all occurrences of a particular character or
substring in a QByteArray, use indexOf() or lastIndexOf(). The substring in a QByteArray, use indexOf() or lastIndexOf(). The
former searches forward starting from a given index position, the former searches forward starting from a given index position, the
@ -932,29 +869,40 @@ QByteArray qUncompress(const uchar* data, int nbytes)
Such considerations, the configuration of such behavior or any mitigation Such considerations, the configuration of such behavior or any mitigation
are outside the scope of the QByteArray API. are outside the scope of the QByteArray API.
\section1 Notes on Locale \section1 C locale and ASCII functions
QByteArray generally handles data as bytes, without presuming any semantics;
where it does presume semantics, it uses the C locale and ASCII encoding.
Standard Unicode encodings are supported by QString; other encodings may be
supported using QStringEncoder and QStringDecoder to convert to Unicode. For
locale-specific interpretation of text, use QLocale or QString.
\section2 Spacing Characters
A frequent requirement is to remove spacing characters from a byte array
('\\n', '\\t', ' ', etc.). If you want to remove spacing from both ends of a
QByteArray, use trimmed(). If you want to remove spacing from both ends and
replace each run of spacing characters with a single space character within
the byte array, use simplified(). Only ASCII spacing characters are
recognized for these purposes.
\section2 Number-String Conversions \section2 Number-String Conversions
Functions that perform conversions between numeric data types and Functions that perform conversions between numeric data types and strings
strings are performed in the C locale, irrespective of the user's are performed in the C locale, regardless of the user's locale settings. Use
locale settings. Use QString to perform locale-aware conversions QLocale to perform locale-aware conversions between numbers and strings.
between numbers and strings.
\section2 8-bit Character Comparisons \section2 Character Case
In QByteArray, the notion of uppercase and lowercase and of which In QByteArray, the notion of uppercase and lowercase and of case-independent
character is greater than or less than another character is done comparison is limited to ASCII. Non-ASCII characters are treated as
in the Latin-1 locale. This affects functions that support a case caseless, since their case depends on encoding. This affects functions that
insensitive option or that compare or lowercase or uppercase support a case insensitive option or that change the case of their
their arguments. Case insensitive operations and comparisons will arguments. Functions that this affects include contains(), indexOf(),
be accurate if both strings contain only Latin-1 characters. lastIndexOf(), isLower(), isUpper(), toLower() and toUpper().
Functions that this affects include contains(), indexOf(),
lastIndexOf(), operator<(), operator<=(), operator>(),
operator>=(), isLower(), isUpper(), toLower() and toUpper().
This issue does not apply to \l{QString}s since they represent This issue does not apply to \l{QString}s since they represent characters
characters using Unicode. using Unicode.
\sa QString, QBitArray \sa QString, QBitArray
*/ */
@ -2899,22 +2847,16 @@ bool QByteArray::endsWith(const char *str) const
} }
/* /*
Returns true if \a c is an uppercase Latin1 letter. Returns true if \a c is an uppercase ASCII letter.
\note The multiplication sign 0xD7 and the sz ligature 0xDF are not
treated as uppercase Latin1.
*/ */
static inline bool isUpperCaseLatin1(char c) static constexpr inline bool isUpperCaseAscii(char c)
{ {
if (c >= 'A' && c <= 'Z') return c >= 'A' && c <= 'Z';
return true;
return (uchar(c) >= 0xC0 && uchar(c) <= 0xDE && uchar(c) != 0xD7);
} }
/*! /*!
Returns \c true if this byte array contains only uppercase letters, Returns \c true if this byte array contains only ASCII uppercase letters,
otherwise returns \c false. The byte array is interpreted as a Latin-1 otherwise returns \c false.
encoded string.
\since 5.12 \since 5.12
\sa isLower(), toUpper() \sa isLower(), toUpper()
@ -2927,7 +2869,7 @@ bool QByteArray::isUpper() const
const char *d = data(); const char *d = data();
for (int i = 0, max = size(); i < max; ++i) { for (int i = 0, max = size(); i < max; ++i) {
if (!isUpperCaseLatin1(d[i])) if (!isUpperCaseAscii(d[i]))
return false; return false;
} }
@ -2935,22 +2877,16 @@ bool QByteArray::isUpper() const
} }
/* /*
Returns true if \a c is a lowercase Latin1 letter. Returns true if \a c is a lowercase ASCII letter.
\note The division sign 0xF7 is not treated as lowercase Latin1,
but the small y dieresis 0xFF is.
*/ */
static inline bool isLowerCaseLatin1(char c) static constexpr inline bool isLowerCaseAscii(char c)
{ {
if (c >= 'a' && c <= 'z') return c >= 'a' && c <= 'z';
return true;
return (uchar(c) >= 0xD0 && uchar(c) != 0xF7);
} }
/*! /*!
Returns \c true if this byte array contains only lowercase letters, Returns \c true if this byte array contains only lowercase ASCII letters,
otherwise returns \c false. The byte array is interpreted as a Latin-1 otherwise returns \c false.
encoded string.
\since 5.12 \since 5.12
\sa isUpper(), toLower() \sa isUpper(), toLower()
@ -2963,7 +2899,7 @@ bool QByteArray::isLower() const
const char *d = data(); const char *d = data();
for (int i = 0, max = size(); i < max; ++i) { for (int i = 0, max = size(); i < max; ++i) {
if (!isLowerCaseLatin1(d[i])) if (!isLowerCaseAscii(d[i]))
return false; return false;
} }
@ -3076,8 +3012,8 @@ QByteArray QByteArray::mid(int pos, int len) const
/*! /*!
\fn QByteArray QByteArray::toLower() const \fn QByteArray QByteArray::toLower() const
Returns a lowercase copy of the byte array. The bytearray is Returns a copy of the byte array in which each ASCII uppercase letter
interpreted as a Latin-1 encoded string. is converted to lowercase.
Example: Example:
\snippet code/src_corelib_text_qbytearray.cpp 30 \snippet code/src_corelib_text_qbytearray.cpp 30
@ -3090,7 +3026,7 @@ QByteArray QByteArray::mid(int pos, int len) const
// (even with constant propagation, there's no gain in performance). // (even with constant propagation, there's no gain in performance).
template <typename T> template <typename T>
Q_NEVER_INLINE Q_NEVER_INLINE
static QByteArray toCase_template(T &input, const uchar * table) static QByteArray toCase_template(T &input, uchar (*lookup)(uchar))
{ {
// find the first bad character in input // find the first bad character in input
const char *orig_begin = input.constBegin(); const char *orig_begin = input.constBegin();
@ -3098,7 +3034,7 @@ static QByteArray toCase_template(T &input, const uchar * table)
const char *e = input.constEnd(); const char *e = input.constEnd();
for ( ; firstBad != e ; ++firstBad) { for ( ; firstBad != e ; ++firstBad) {
uchar ch = uchar(*firstBad); uchar ch = uchar(*firstBad);
uchar converted = table[ch]; uchar converted = lookup(ch);
if (ch != converted) if (ch != converted)
break; break;
} }
@ -3111,27 +3047,26 @@ static QByteArray toCase_template(T &input, const uchar * table)
char *b = s.begin(); // will detach if necessary char *b = s.begin(); // will detach if necessary
char *p = b + (firstBad - orig_begin); char *p = b + (firstBad - orig_begin);
e = b + s.size(); e = b + s.size();
for ( ; p != e; ++p) { for ( ; p != e; ++p)
*p = char(uchar(table[uchar(*p)])); *p = char(lookup(uchar(*p)));
}
return s; return s;
} }
QByteArray QByteArray::toLower_helper(const QByteArray &a) QByteArray QByteArray::toLower_helper(const QByteArray &a)
{ {
return toCase_template(a, latin1_lowercased); return toCase_template(a, asciiLower);
} }
QByteArray QByteArray::toLower_helper(QByteArray &a) QByteArray QByteArray::toLower_helper(QByteArray &a)
{ {
return toCase_template(a, latin1_lowercased); return toCase_template(a, asciiLower);
} }
/*! /*!
\fn QByteArray QByteArray::toUpper() const \fn QByteArray QByteArray::toUpper() const
Returns an uppercase copy of the byte array. The bytearray is Returns a copy of the byte array in which each ASCII lowercase letter
interpreted as a Latin-1 encoded string. is converted to uppercase.
Example: Example:
\snippet code/src_corelib_text_qbytearray.cpp 31 \snippet code/src_corelib_text_qbytearray.cpp 31
@ -3141,12 +3076,12 @@ QByteArray QByteArray::toLower_helper(QByteArray &a)
QByteArray QByteArray::toUpper_helper(const QByteArray &a) QByteArray QByteArray::toUpper_helper(const QByteArray &a)
{ {
return toCase_template(a, latin1_uppercased); return toCase_template(a, asciiUpper);
} }
QByteArray QByteArray::toUpper_helper(QByteArray &a) QByteArray QByteArray::toUpper_helper(QByteArray &a)
{ {
return toCase_template(a, latin1_uppercased); return toCase_template(a, asciiUpper);
} }
/*! \fn void QByteArray::clear() /*! \fn void QByteArray::clear()
@ -4226,7 +4161,7 @@ QByteArray &QByteArray::setNum(double n, char f, int prec)
QLocaleData::DoubleForm form = QLocaleData::DFDecimal; QLocaleData::DoubleForm form = QLocaleData::DFDecimal;
uint flags = QLocaleData::ZeroPadExponent; uint flags = QLocaleData::ZeroPadExponent;
char lower = latin1_lowercased[uchar(f)]; char lower = asciiLower(uchar(f));
if (f != lower) if (f != lower)
flags |= QLocaleData::CapitalEorX; flags |= QLocaleData::CapitalEorX;
f = lower; f = lower;
@ -4248,7 +4183,7 @@ QByteArray &QByteArray::setNum(double n, char f, int prec)
break; break;
} }
*this = QLocaleData::c()->doubleToString(n, prec, form, -1, flags).toLatin1(); *this = QLocaleData::c()->doubleToString(n, prec, form, -1, flags).toUtf8();
return *this; return *this;
} }

View File

@ -1192,6 +1192,46 @@ static int ucstrcmp(const QChar *a, size_t alen, const char *b, size_t blen)
return cmp ? cmp : lencmp(alen, blen); return cmp ? cmp : lencmp(alen, blen);
} }
static int latin1nicmp(const char *lhsChar, int lSize, const char *rhsChar, int rSize)
{
constexpr uchar latin1Lower[256] = {
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
// 0xd7 (multiplication sign) and 0xdf (sz ligature) complicate life
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};
// We're called with QLatin1String's .data() and .size():
Q_ASSERT(lSize >= 0 && rSize >= 0);
if (!lSize)
return rSize ? -1 : 0;
if (!rSize)
return 1;
const int size = std::min(lSize, rSize);
const uchar *lhs = reinterpret_cast<const uchar *>(lhsChar);
const uchar *rhs = reinterpret_cast<const uchar *>(rhsChar);
Q_ASSERT(lhs && rhs); // since both lSize and rSize are positive
for (int i = 0; i < size; i++) {
Q_ASSERT(lhs[i] && rhs[i]);
if (int res = latin1Lower[lhs[i]] - latin1Lower[rhs[i]])
return res;
}
return lencmp(lSize, rSize);
}
static int qt_compare_strings(QStringView lhs, QStringView rhs, Qt::CaseSensitivity cs) noexcept static int qt_compare_strings(QStringView lhs, QStringView rhs, Qt::CaseSensitivity cs) noexcept
{ {
if (cs == Qt::CaseSensitive) if (cs == Qt::CaseSensitive)
@ -1218,7 +1258,7 @@ static int qt_compare_strings(QLatin1String lhs, QLatin1String rhs, Qt::CaseSens
if (lhs.isEmpty()) if (lhs.isEmpty())
return lencmp(0, rhs.size()); return lencmp(0, rhs.size());
if (cs == Qt::CaseInsensitive) if (cs == Qt::CaseInsensitive)
return qstrnicmp(lhs.data(), lhs.size(), rhs.data(), rhs.size()); return latin1nicmp(lhs.data(), lhs.size(), rhs.data(), rhs.size());
const auto l = std::min(lhs.size(), rhs.size()); const auto l = std::min(lhs.size(), rhs.size());
int r = qstrncmp(lhs.data(), rhs.data(), l); int r = qstrncmp(lhs.data(), rhs.data(), l);
return r ? r : lencmp(lhs.size(), rhs.size()); return r ? r : lencmp(lhs.size(), rhs.size());

View File

@ -176,9 +176,9 @@ QByteArray verifyZeroTermination(const QByteArray &ba)
int baSize = ba.size(); int baSize = ba.size();
char baTerminator = ba.constData()[baSize]; char baTerminator = ba.constData()[baSize];
if ('\0' != baTerminator) if ('\0' != baTerminator)
return QString::fromLatin1( return QString::fromUtf8(
"*** Result ('%1') not null-terminated: 0x%2 ***").arg(QString::fromLatin1(ba)) "*** Result ('%1') not null-terminated: 0x%2 ***").arg(QString::fromUtf8(ba))
.arg(baTerminator, 2, 16, QChar('0')).toLatin1(); .arg(baTerminator, 2, 16, QChar('0')).toUtf8();
// Skip mutating checks on shared strings // Skip mutating checks on shared strings
if (baDataPtr->isShared()) if (baDataPtr->isShared())
@ -934,30 +934,30 @@ void tst_QByteArray::qstricmp()
QFETCH(QString, str1); QFETCH(QString, str1);
QFETCH(QString, str2); QFETCH(QString, str2);
int expected = strcmp(str1.toUpper().toLatin1(), int expected = strcmp(str1.toUpper().toUtf8(),
str2.toUpper().toLatin1()); str2.toUpper().toUtf8());
if ( expected != 0 ) { if ( expected != 0 ) {
expected = (expected < 0 ? -1 : 1); expected = (expected < 0 ? -1 : 1);
} }
int actual = ::qstricmp(str1.toLatin1(), str2.toLatin1()); int actual = ::qstricmp(str1.toUtf8(), str2.toUtf8());
if ( actual != 0 ) { if ( actual != 0 ) {
actual = (actual < 0 ? -1 : 1); actual = (actual < 0 ? -1 : 1);
} }
QCOMPARE(actual, expected); QCOMPARE(actual, expected);
actual = ::qstricmp("012345679abcd" + str1.toLatin1(), "012345679AbCd" + str2.toLatin1()); actual = ::qstricmp("012345679abcd" + str1.toUtf8(), "012345679AbCd" + str2.toUtf8());
if ( actual != 0 ) { if ( actual != 0 ) {
actual = (actual < 0 ? -1 : 1); actual = (actual < 0 ? -1 : 1);
} }
QCOMPARE(actual, expected); QCOMPARE(actual, expected);
actual = str1.toLatin1().compare(str2.toLatin1(), Qt::CaseInsensitive); actual = str1.toUtf8().compare(str2.toUtf8(), Qt::CaseInsensitive);
if ( actual != 0 ) { if ( actual != 0 ) {
actual = (actual < 0 ? -1 : 1); actual = (actual < 0 ? -1 : 1);
} }
QCOMPARE(actual, expected); QCOMPARE(actual, expected);
actual = str1.toLatin1().compare(str2.toLatin1().constData(), Qt::CaseInsensitive); actual = str1.toUtf8().compare(str2.toUtf8().constData(), Qt::CaseInsensitive);
if ( actual != 0 ) { if ( actual != 0 ) {
actual = (actual < 0 ? -1 : 1); actual = (actual < 0 ? -1 : 1);
} }
@ -1468,7 +1468,7 @@ void tst_QByteArray::toULong_data()
QTest::addColumn<bool>("ok"); QTest::addColumn<bool>("ok");
ulong LongMaxPlusOne = (ulong)LONG_MAX + 1; ulong LongMaxPlusOne = (ulong)LONG_MAX + 1;
QTest::newRow("LONG_MAX+1") << QString::number(LongMaxPlusOne).toLatin1() << 10 << LongMaxPlusOne << true; QTest::newRow("LONG_MAX+1") << QString::number(LongMaxPlusOne).toUtf8() << 10 << LongMaxPlusOne << true;
QTest::newRow("default") << QByteArray() << 10 << 0UL << false; QTest::newRow("default") << QByteArray() << 10 << 0UL << false;
QTest::newRow("empty") << QByteArray("") << 10 << 0UL << false; QTest::newRow("empty") << QByteArray("") << 10 << 0UL << false;
QTest::newRow("ulong1") << QByteArray("3234567890") << 10 << 3234567890UL << true; QTest::newRow("ulong1") << QByteArray("3234567890") << 10 << 3234567890UL << true;
@ -1990,7 +1990,7 @@ void tst_QByteArray::compareCharStar()
const bool isEqual = result == 0; const bool isEqual = result == 0;
const bool isLess = result < 0; const bool isLess = result < 0;
const bool isGreater = result > 0; const bool isGreater = result > 0;
QByteArray qba = string2.toLatin1(); QByteArray qba = string2.toUtf8();
const char *str2 = qba.constData(); const char *str2 = qba.constData();
if (string2.isNull()) if (string2.isNull())
str2 = 0; str2 = 0;
@ -2297,6 +2297,14 @@ void tst_QByteArray::toUpperLower_data()
QTest::addColumn<QByteArray>("upper"); QTest::addColumn<QByteArray>("upper");
QTest::addColumn<QByteArray>("lower"); QTest::addColumn<QByteArray>("lower");
{
QByteArray nonAscii(128, Qt::Uninitialized);
char *data = nonAscii.data();
for (unsigned char i = 0; i < 128; ++i)
data[i] = i + 128;
QTest::newRow("non-ASCII") << nonAscii << nonAscii << nonAscii;
}
QTest::newRow("empty") << QByteArray() << QByteArray() << QByteArray(); QTest::newRow("empty") << QByteArray() << QByteArray() << QByteArray();
QTest::newRow("literal") << QByteArrayLiteral("Hello World") QTest::newRow("literal") << QByteArrayLiteral("Hello World")
<< QByteArrayLiteral("HELLO WORLD") << QByteArrayLiteral("HELLO WORLD")
@ -2304,9 +2312,6 @@ void tst_QByteArray::toUpperLower_data()
QTest::newRow("ascii") << QByteArray("Hello World, this is a STRING") QTest::newRow("ascii") << QByteArray("Hello World, this is a STRING")
<< QByteArray("HELLO WORLD, THIS IS A STRING") << QByteArray("HELLO WORLD, THIS IS A STRING")
<< QByteArray("hello world, this is a string"); << QByteArray("hello world, this is a string");
QTest::newRow("latin1") << QByteArray("R\311sum\351")
<< QByteArray("R\311SUM\311")
<< QByteArray("r\351sum\351");
QTest::newRow("nul") << QByteArray("a\0B", 3) << QByteArray("A\0B", 3) << QByteArray("a\0b", 3); QTest::newRow("nul") << QByteArray("a\0B", 3) << QByteArray("A\0B", 3) << QByteArray("a\0b", 3);
} }
@ -2350,9 +2355,9 @@ void tst_QByteArray::isUpper()
QVERIFY(!QByteArray().isUpper()); QVERIFY(!QByteArray().isUpper());
QVERIFY(!QByteArray("").isUpper()); QVERIFY(!QByteArray("").isUpper());
QVERIFY(QByteArray("TEXT").isUpper()); QVERIFY(QByteArray("TEXT").isUpper());
QVERIFY(QByteArray("\xD0\xDE").isUpper()); QVERIFY(!QByteArray("\xD0\xDE").isUpper()); // non-ASCII is neither upper nor lower
QVERIFY(!QByteArray("\xD7").isUpper()); // multiplication sign is not upper QVERIFY(!QByteArray("\xD7").isUpper());
QVERIFY(!QByteArray("\xDF").isUpper()); // sz ligature is not upper QVERIFY(!QByteArray("\xDF").isUpper());
QVERIFY(!QByteArray("text").isUpper()); QVERIFY(!QByteArray("text").isUpper());
QVERIFY(!QByteArray("Text").isUpper()); QVERIFY(!QByteArray("Text").isUpper());
QVERIFY(!QByteArray("tExt").isUpper()); QVERIFY(!QByteArray("tExt").isUpper());
@ -2373,8 +2378,8 @@ void tst_QByteArray::isLower()
QVERIFY(!QByteArray().isLower()); QVERIFY(!QByteArray().isLower());
QVERIFY(!QByteArray("").isLower()); QVERIFY(!QByteArray("").isLower());
QVERIFY(QByteArray("text").isLower()); QVERIFY(QByteArray("text").isLower());
QVERIFY(QByteArray("\xE0\xFF").isLower()); QVERIFY(!QByteArray("\xE0\xFF").isLower()); // non-ASCII is neither upper nor lower
QVERIFY(!QByteArray("\xF7").isLower()); // division sign is not lower QVERIFY(!QByteArray("\xF7").isLower());
QVERIFY(!QByteArray("Text").isLower()); QVERIFY(!QByteArray("Text").isLower());
QVERIFY(!QByteArray("tExt").isLower()); QVERIFY(!QByteArray("tExt").isLower());
QVERIFY(!QByteArray("teXt").isLower()); QVERIFY(!QByteArray("teXt").isLower());
@ -2416,7 +2421,6 @@ void tst_QByteArray::stdString()
QVERIFY(l1str.length() < utf8str.length()); QVERIFY(l1str.length() < utf8str.length());
} }
const char globalChar = '1'; const char globalChar = '1';
QTEST_MAIN(tst_QByteArray) QTEST_MAIN(tst_QByteArray)

View File

@ -972,7 +972,7 @@ void tst_QStringApiSymmetry::compare_data(bool hasConceptOfNullAndEmpty)
<< 0 << 0; << 0 << 0;
} }
#define ROW(lhs, rhs) \ #define ROW(lhs, rhs, caseless) \
do { \ do { \
static const QString pinned[] = { \ static const QString pinned[] = { \
QString(QLatin1String(lhs)), \ QString(QLatin1String(lhs)), \
@ -981,16 +981,19 @@ void tst_QStringApiSymmetry::compare_data(bool hasConceptOfNullAndEmpty)
QTest::newRow(qUtf8Printable(QLatin1String("'" lhs "' <> '" rhs "': "))) \ QTest::newRow(qUtf8Printable(QLatin1String("'" lhs "' <> '" rhs "': "))) \
<< QStringRef(&pinned[0]) << QLatin1String(lhs) \ << QStringRef(&pinned[0]) << QLatin1String(lhs) \
<< QStringRef(&pinned[1]) << QLatin1String(rhs) \ << QStringRef(&pinned[1]) << QLatin1String(rhs) \
<< sign(qstrcmp(lhs, rhs)) << sign(qstricmp(lhs, rhs)); \ << sign(qstrcmp(lhs, rhs)) << caseless; \
} while (false) } while (false)
ROW("", "0"); #define ASCIIROW(lhs, rhs) ROW(lhs, rhs, sign(qstricmp(lhs, rhs)))
ROW("0", ""); ASCIIROW("", "0");
ROW("0", "1"); ASCIIROW("0", "");
ROW("0", "0"); ASCIIROW("0", "1");
ROW("10", "0"); ASCIIROW("0", "0");
ROW("01", "1"); ASCIIROW("10", "0");
ROW("\xE4", "\xE4"); // ä <> ä ASCIIROW("01", "1");
ROW("\xE4", "\xC4"); // ä <> Ä ASCIIROW("e", "e");
ASCIIROW("e", "E");
ROW("\xE4", "\xE4", 0); // ä <> ä
ROW("\xE4", "\xC4", 0); // ä <> Ä
#undef ROW #undef ROW
} }