Restore handling of BOMs in QString::fromUtf8
Commit 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 removed the BOM handling in QString::fromUtf8 but did not document the change. This commit brings the behavior back and adds a unit test so we don't break it again.

Discussed-on: http://lists.qt-project.org/pipermail/development/2014-April/016532.html
Change-Id: Ifb7a9a6e5a494622f46b8ab435e1d168b862d952
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
parent
cae970c686
commit
f56ef579ba
@ -52,6 +52,8 @@ QT_BEGIN_NAMESPACE
|
|||||||
|
|
||||||
enum { Endian = 0, Data = 1 };
|
enum { Endian = 0, Data = 1 };
|
||||||
|
|
||||||
|
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
|
||||||
|
|
||||||
#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
|
#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
|
||||||
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
|
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
|
||||||
{
|
{
|
||||||
@ -187,9 +189,9 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
|
|||||||
int invalid = 0;
|
int invalid = 0;
|
||||||
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
|
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
|
||||||
// append UTF-8 BOM
|
// append UTF-8 BOM
|
||||||
*cursor++ = 0xef;
|
*cursor++ = utf8bom[0];
|
||||||
*cursor++ = 0xbb;
|
*cursor++ = utf8bom[1];
|
||||||
*cursor++ = 0xbf;
|
*cursor++ = utf8bom[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
const ushort *nextAscii = src;
|
const ushort *nextAscii = src;
|
||||||
@ -240,8 +242,19 @@ QString QUtf8::convertToUnicode(const char *chars, int len)
|
|||||||
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
||||||
const uchar *end = src + len;
|
const uchar *end = src + len;
|
||||||
|
|
||||||
while (src < end) {
|
// attempt to do a full decoding in SIMD
|
||||||
const uchar *nextAscii = end;
|
const uchar *nextAscii = end;
|
||||||
|
if (!simdDecodeAscii(dst, nextAscii, src, end)) {
|
||||||
|
// at least one non-ASCII entry
|
||||||
|
// check if we failed to decode the UTF-8 BOM; if so, skip it
|
||||||
|
if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
|
||||||
|
&& end - src >= 3
|
||||||
|
&& Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
|
||||||
|
src += 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (src < end) {
|
||||||
|
nextAscii = end;
|
||||||
if (simdDecodeAscii(dst, nextAscii, src, end))
|
if (simdDecodeAscii(dst, nextAscii, src, end))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -254,6 +267,7 @@ QString QUtf8::convertToUnicode(const char *chars, int len)
|
|||||||
}
|
}
|
||||||
} while (src < nextAscii);
|
} while (src < nextAscii);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
|
result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
|
||||||
return result;
|
return result;
|
||||||
|
@ -3619,6 +3619,14 @@ void tst_QString::fromUtf8_data()
|
|||||||
str += " some ";
|
str += " some ";
|
||||||
QTest::newRow("str3-len") << QByteArray("\342\202\254 some text") << str << 9;
|
QTest::newRow("str3-len") << QByteArray("\342\202\254 some text") << str << 9;
|
||||||
|
|
||||||
|
// test that QString::fromUtf8 suppresses an initial BOM, but not a ZWNBSP
|
||||||
|
str = "hello";
|
||||||
|
QByteArray bom("\357\273\277");
|
||||||
|
QTest::newRow("bom0") << bom << QString() << 3;
|
||||||
|
QTest::newRow("bom1") << bom + "hello" << str << -1;
|
||||||
|
QTest::newRow("bom+zwnbsp0") << bom + bom << QString(QChar(0xfeff)) << -1;
|
||||||
|
QTest::newRow("bom+zwnbsp1") << bom + "hello" + bom << str + QChar(0xfeff) << -1;
|
||||||
|
|
||||||
str = "hello";
|
str = "hello";
|
||||||
str += QChar::ReplacementCharacter;
|
str += QChar::ReplacementCharacter;
|
||||||
str += QChar(0x68);
|
str += QChar(0x68);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user