This is semantic patch using ClangTidyTransformator: auto QtContainerClass = expr(hasType(namedDecl(hasAnyName(<classes>)))).bind(o) makeRule(cxxMemberCallExpr(on(QtContainerClass), callee(cxxMethodDecl(hasAnyName({"count", "length"), parameterCountIs(0))))), changeTo(cat(access(o, cat("size"), "()"))), cat("use 'size()' instead of 'count()/length()'")) a.k.a qt-port-to-std-compatible-api with config Scope: 'Container'. <classes> are: // sequential: "QByteArray", "QList", "QQueue", "QStack", "QString", "QVarLengthArray", "QVector", // associative: "QHash", "QMultiHash", "QMap", "QMultiMap", "QSet", // Qt has no QMultiSet Change-Id: Ibe8837be96e8d30d1846881ecd65180c1bc459af Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org> Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
2322 lines
80 KiB
C++
2322 lines
80 KiB
C++
// Copyright (C) 2020 The Qt Company Ltd.
|
|
// Copyright (C) 2020 Intel Corporation.
|
|
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
|
|
|
#include <qstringconverter.h>
|
|
#include <private/qstringconverter_p.h>
|
|
#include "qendian.h"
|
|
|
|
#include "private/qsimd_p.h"
|
|
#include "private/qstringiterator_p.h"
|
|
#include "private/qtools_p.h"
|
|
#include "qbytearraymatcher.h"
|
|
|
|
#if QT_CONFIG(icu)
|
|
#include <unicode/ucnv.h>
|
|
#include <unicode/ucnv_cb.h>
|
|
#include <unicode/ucnv_err.h>
|
|
#include <unicode/ustring.h>
|
|
#endif
|
|
|
|
#ifdef Q_OS_WIN
|
|
#include <qt_windows.h>
|
|
#ifndef QT_BOOTSTRAPPED
|
|
#include <QtCore/qvarlengtharray.h>
|
|
#endif // !QT_BOOTSTRAPPED
|
|
#endif
|
|
|
|
#if __has_include(<bit>) && __cplusplus > 201703L
|
|
#include <bit>
|
|
#endif
|
|
|
|
QT_BEGIN_NAMESPACE
|
|
|
|
static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
|
|
static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
|
|
static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
|
|
static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
|
|
|
|
enum { Endian = 0, Data = 1 };
|
|
|
|
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
|
|
|
|
#if defined(__SSE2__) || defined(__ARM_NEON__)
|
|
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
|
|
{
|
|
#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
|
|
return std::bit_width(v) - 1;
|
|
#else
|
|
uint result = qCountLeadingZeroBits(v);
|
|
// Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
|
|
// and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
|
|
// counting up: msb index is 0 (because it starts there), and the lsb index is 31.
|
|
result ^= sizeof(unsigned) * 8 - 1;
|
|
return result;
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__)
|
|
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
|
|
{
|
|
// do sixteen characters at a time
|
|
for ( ; end - src >= 16; src += 16, dst += 16) {
|
|
# ifdef __AVX2__
|
|
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
|
__m128i data1 = _mm256_castsi256_si128(data);
|
|
__m128i data2 = _mm256_extracti128_si256(data, 1);
|
|
# else
|
|
__m128i data1 = _mm_loadu_si128((const __m128i*)src);
|
|
__m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
|
|
# endif
|
|
|
|
// check if everything is ASCII
|
|
// the highest ASCII value is U+007F
|
|
// Do the packing directly:
|
|
// The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
|
|
// with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
|
|
// while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
|
|
// we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
|
|
// "non-ASCII", but it's an acceptable compromise.
|
|
__m128i packed = _mm_packus_epi16(data1, data2);
|
|
__m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
|
|
|
|
// store, even if there are non-ASCII characters here
|
|
_mm_storeu_si128((__m128i*)dst, packed);
|
|
|
|
// n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
|
|
ushort n = ~_mm_movemask_epi8(nonAscii);
|
|
if (n) {
|
|
// find the next probable ASCII character
|
|
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
nextAscii = src + qBitScanReverse(n) + 1;
|
|
|
|
n = qCountTrailingZeroBits(n);
|
|
dst += n;
|
|
src += n;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (end - src >= 8) {
|
|
// do eight characters at a time
|
|
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
|
|
__m128i packed = _mm_packus_epi16(data, data);
|
|
__m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
|
|
|
|
// store even non-ASCII
|
|
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
|
|
|
|
uchar n = ~_mm_movemask_epi8(nonAscii);
|
|
if (n) {
|
|
nextAscii = src + qBitScanReverse(n) + 1;
|
|
n = qCountTrailingZeroBits(n);
|
|
dst += n;
|
|
src += n;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return src == end;
|
|
}
|
|
|
|
static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
|
|
{
|
|
// do sixteen characters at a time
|
|
for ( ; end - src >= 16; src += 16, dst += 16) {
|
|
__m128i data = _mm_loadu_si128((const __m128i*)src);
|
|
|
|
#ifdef __AVX2__
|
|
const int BitSpacing = 2;
|
|
// load and zero extend to an YMM register
|
|
const __m256i extended = _mm256_cvtepu8_epi16(data);
|
|
|
|
uint n = _mm256_movemask_epi8(extended);
|
|
if (!n) {
|
|
// store
|
|
_mm256_storeu_si256((__m256i*)dst, extended);
|
|
continue;
|
|
}
|
|
#else
|
|
const int BitSpacing = 1;
|
|
|
|
// check if everything is ASCII
|
|
// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
|
|
uint n = _mm_movemask_epi8(data);
|
|
if (!n) {
|
|
// unpack
|
|
_mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
|
|
_mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
|
|
continue;
|
|
}
|
|
#endif
|
|
|
|
// copy the front part that is still ASCII
|
|
while (!(n & 1)) {
|
|
*dst++ = *src++;
|
|
n >>= BitSpacing;
|
|
}
|
|
|
|
// find the next probable ASCII character
|
|
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
n = qBitScanReverse(n);
|
|
nextAscii = src + (n / BitSpacing) + 1;
|
|
return false;
|
|
|
|
}
|
|
|
|
if (end - src >= 8) {
|
|
__m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
|
|
uint n = _mm_movemask_epi8(data) & 0xff;
|
|
if (!n) {
|
|
// unpack and store
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
|
|
} else {
|
|
while (!(n & 1)) {
|
|
*dst++ = *src++;
|
|
n >>= 1;
|
|
}
|
|
|
|
n = qBitScanReverse(n);
|
|
nextAscii = src + n + 1;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return src == end;
|
|
}
|
|
|
|
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
|
|
{
|
|
#ifdef __AVX2__
|
|
// do 32 characters at a time
|
|
// (this is similar to simdTestMask in qstring.cpp)
|
|
const __m256i mask = _mm256_set1_epi8(0x80);
|
|
for ( ; end - src >= 32; src += 32) {
|
|
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
|
if (_mm256_testz_si256(mask, data))
|
|
continue;
|
|
|
|
uint n = _mm256_movemask_epi8(data);
|
|
Q_ASSUME(n);
|
|
|
|
// find the next probable ASCII character
|
|
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
nextAscii = src + qBitScanReverse(n) + 1;
|
|
|
|
// return the non-ASCII character
|
|
return src + qCountTrailingZeroBits(n);
|
|
}
|
|
#endif
|
|
|
|
// do sixteen characters at a time
|
|
for ( ; end - src >= 16; src += 16) {
|
|
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
|
|
|
|
// check if everything is ASCII
|
|
// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
|
|
uint n = _mm_movemask_epi8(data);
|
|
if (!n)
|
|
continue;
|
|
|
|
// find the next probable ASCII character
|
|
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
nextAscii = src + qBitScanReverse(n) + 1;
|
|
|
|
// return the non-ASCII character
|
|
return src + qCountTrailingZeroBits(n);
|
|
}
|
|
|
|
// do four characters at a time
|
|
for ( ; end - src >= 4; src += 4) {
|
|
quint32 data = qFromUnaligned<quint32>(src);
|
|
data &= 0x80808080U;
|
|
if (!data)
|
|
continue;
|
|
|
|
// We don't try to guess which of the three bytes is ASCII and which
|
|
// one isn't. The chance that at least two of them are non-ASCII is
|
|
// better than 75%.
|
|
nextAscii = src;
|
|
return src;
|
|
}
|
|
nextAscii = end;
|
|
return src;
|
|
}
|
|
|
|
// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
|
|
// and advance src8 and src16 to the first character that could not be compared
|
|
static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
|
|
{
|
|
int bitSpacing = 1;
|
|
qptrdiff len = qMin(end8 - src8, end16 - src16);
|
|
qptrdiff offset = 0;
|
|
uint mask = 0;
|
|
|
|
// do sixteen characters at a time
|
|
for ( ; offset + 16 < len; offset += 16) {
|
|
__m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
|
|
#ifdef __AVX2__
|
|
// AVX2 version, use 256-bit registers and VPMOVXZBW
|
|
__m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
|
|
|
|
// expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
|
|
__m256i datax8 = _mm256_cvtepu8_epi16(data8);
|
|
mask = _mm256_movemask_epi8(datax8);
|
|
if (mask)
|
|
break;
|
|
|
|
// compare Latin1 to UTF-16
|
|
__m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
|
|
mask = ~_mm256_movemask_epi8(latin1cmp);
|
|
if (mask)
|
|
break;
|
|
#else
|
|
// non-AVX2 code
|
|
__m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
|
|
__m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);
|
|
|
|
// expand US-ASCII as if it were Latin1, we'll confirm later
|
|
__m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
|
|
__m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
|
|
|
|
// compare Latin1 to UTF-16
|
|
__m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
|
|
__m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
|
|
mask = _mm_movemask_epi8(latin1cmphi) << 16;
|
|
mask |= ushort(_mm_movemask_epi8(latin1cmplo));
|
|
mask = ~mask;
|
|
if (mask)
|
|
break;
|
|
|
|
// confirm it was US-ASCII
|
|
mask = _mm_movemask_epi8(data8);
|
|
if (mask) {
|
|
bitSpacing = 0;
|
|
break;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// helper for comparing 4 or 8 characters
|
|
auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
|
|
// n = 4 -> sizemask = 0xff
|
|
// n = 8 -> sizemask = 0xffff
|
|
unsigned sizemask = (1U << (2 * n)) - 1;
|
|
|
|
// expand as if Latin1
|
|
data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
|
|
|
|
// compare and confirm it's US-ASCII
|
|
__m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
|
|
mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
|
|
mask |= _mm_movemask_epi8(data8);
|
|
if (mask == 0)
|
|
offset += n;
|
|
};
|
|
|
|
// do eight characters at a time
|
|
if (mask == 0 && offset + 8 < len) {
|
|
__m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
|
|
__m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
|
|
cmp_lt_16(8, data8, data16);
|
|
}
|
|
|
|
// do four characters
|
|
if (mask == 0 && offset + 4 < len) {
|
|
__m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
|
|
__m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
|
|
cmp_lt_16(4, data8, data16);
|
|
}
|
|
|
|
// correct the source pointers to point to the first character we couldn't deal with
|
|
if (mask)
|
|
offset += qCountTrailingZeroBits(mask) >> bitSpacing;
|
|
src8 += offset;
|
|
src16 += offset;
|
|
}
|
|
#elif defined(__ARM_NEON__)
|
|
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
|
|
{
|
|
uint16x8_t maxAscii = vdupq_n_u16(0x7f);
|
|
uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
|
|
uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
|
|
|
|
// do sixteen characters at a time
|
|
for ( ; end - src >= 16; src += 16, dst += 16) {
|
|
// load 2 lanes (or: "load interleaved")
|
|
uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
|
|
|
|
// check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
|
|
// add those together into a scalar, and merge the scalars.
|
|
uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
|
|
| vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
|
|
|
|
// merge the two lanes by shifting the values of the second by 8 and inserting them
|
|
uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
|
|
|
|
// store, even if there are non-ASCII characters here
|
|
vst1q_u8(dst, vreinterpretq_u8_u16(out));
|
|
|
|
if (nonAscii) {
|
|
// find the next probable ASCII character
|
|
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
nextAscii = src + qBitScanReverse(nonAscii) + 1;
|
|
|
|
nonAscii = qCountTrailingZeroBits(nonAscii);
|
|
dst += nonAscii;
|
|
src += nonAscii;
|
|
return false;
|
|
}
|
|
}
|
|
return src == end;
|
|
}
|
|
|
|
static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
|
|
{
|
|
// do eight characters at a time
|
|
uint8x8_t msb_mask = vdup_n_u8(0x80);
|
|
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
|
for ( ; end - src >= 8; src += 8, dst += 8) {
|
|
uint8x8_t c = vld1_u8(src);
|
|
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
|
|
if (!n) {
|
|
// store
|
|
vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
|
|
continue;
|
|
}
|
|
|
|
// copy the front part that is still ASCII
|
|
while (!(n & 1)) {
|
|
*dst++ = *src++;
|
|
n >>= 1;
|
|
}
|
|
|
|
// find the next probable ASCII character
|
|
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
n = qBitScanReverse(n);
|
|
nextAscii = src + n + 1;
|
|
return false;
|
|
|
|
}
|
|
return src == end;
|
|
}
|
|
|
|
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
|
|
{
|
|
// The SIMD code below is untested, so just force an early return until
|
|
// we've had the time to verify it works.
|
|
nextAscii = end;
|
|
return src;
|
|
|
|
// do eight characters at a time
|
|
uint8x8_t msb_mask = vdup_n_u8(0x80);
|
|
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
|
for ( ; end - src >= 8; src += 8) {
|
|
uint8x8_t c = vld1_u8(src);
|
|
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
|
|
if (!n)
|
|
continue;
|
|
|
|
// find the next probable ASCII character
|
|
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
|
|
// characters still coming
|
|
nextAscii = src + qBitScanReverse(n) + 1;
|
|
|
|
// return the non-ASCII character
|
|
return src + qCountTrailingZeroBits(n);
|
|
}
|
|
nextAscii = end;
|
|
return src;
|
|
}
|
|
|
|
static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
|
|
{
|
|
}
|
|
#else
|
|
static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
|
|
{
|
|
nextAscii = end;
|
|
return src;
|
|
}
|
|
|
|
static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
|
|
{
|
|
}
|
|
#endif
|
|
|
|
enum { HeaderDone = 1 };
|
|
|
|
QByteArray QUtf8::convertFromUnicode(QStringView in)
|
|
{
|
|
qsizetype len = in.size();
|
|
|
|
// create a QByteArray with the worst case scenario size
|
|
QByteArray result(len * 3, Qt::Uninitialized);
|
|
uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
|
|
const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
|
|
const char16_t *const end = src + len;
|
|
|
|
while (src != end) {
|
|
const char16_t *nextAscii = end;
|
|
if (simdEncodeAscii(dst, nextAscii, src, end))
|
|
break;
|
|
|
|
do {
|
|
char16_t u = *src++;
|
|
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
|
|
if (res < 0) {
|
|
// encoding error - append '?'
|
|
*dst++ = '?';
|
|
}
|
|
} while (src < nextAscii);
|
|
}
|
|
|
|
result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
|
|
return result;
|
|
}
|
|
|
|
QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State *state)
|
|
{
|
|
QByteArray ba(3*in.size() +3, Qt::Uninitialized);
|
|
char *end = convertFromUnicode(ba.data(), in, state);
|
|
ba.truncate(end - ba.data());
|
|
return ba;
|
|
}
|
|
|
|
char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
Q_ASSERT(state);
|
|
const QChar *uc = in.data();
|
|
qsizetype len = in.length();
|
|
if (!len)
|
|
return out;
|
|
|
|
auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
|
|
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
|
|
*cursor++ = 0;
|
|
} else {
|
|
// QChar::replacement encoded in utf8
|
|
*cursor++ = 0xef;
|
|
*cursor++ = 0xbf;
|
|
*cursor++ = 0xbd;
|
|
}
|
|
return cursor;
|
|
};
|
|
|
|
uchar *cursor = reinterpret_cast<uchar *>(out);
|
|
const char16_t *src = reinterpret_cast<const char16_t *>(uc);
|
|
const char16_t *const end = src + len;
|
|
|
|
if (!(state->flags & QStringDecoder::Flag::Stateless)) {
|
|
if (state->remainingChars) {
|
|
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
|
|
if (res < 0)
|
|
cursor = appendReplacementChar(cursor);
|
|
state->state_data[0] = 0;
|
|
state->remainingChars = 0;
|
|
} else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
|
|
// append UTF-8 BOM
|
|
*cursor++ = utf8bom[0];
|
|
*cursor++ = utf8bom[1];
|
|
*cursor++ = utf8bom[2];
|
|
state->internalState |= HeaderDone;
|
|
}
|
|
}
|
|
|
|
while (src != end) {
|
|
const char16_t *nextAscii = end;
|
|
if (simdEncodeAscii(cursor, nextAscii, src, end))
|
|
break;
|
|
|
|
do {
|
|
char16_t uc = *src++;
|
|
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
|
|
if (Q_LIKELY(res >= 0))
|
|
continue;
|
|
|
|
if (res == QUtf8BaseTraits::Error) {
|
|
// encoding error
|
|
++state->invalidChars;
|
|
cursor = appendReplacementChar(cursor);
|
|
} else if (res == QUtf8BaseTraits::EndOfString) {
|
|
if (state->flags & QStringConverter::Flag::Stateless) {
|
|
++state->invalidChars;
|
|
cursor = appendReplacementChar(cursor);
|
|
} else {
|
|
state->remainingChars = 1;
|
|
state->state_data[0] = uc;
|
|
}
|
|
return reinterpret_cast<char *>(cursor);
|
|
}
|
|
} while (src < nextAscii);
|
|
}
|
|
|
|
return reinterpret_cast<char *>(cursor);
|
|
}
|
|
|
|
QString QUtf8::convertToUnicode(QByteArrayView in)
|
|
{
|
|
// UTF-8 to UTF-16 always needs the exact same number of words or less:
|
|
// UTF-8 UTF-16
|
|
// 1 byte 1 word
|
|
// 2 bytes 1 word
|
|
// 3 bytes 1 word
|
|
// 4 bytes 2 words (one surrogate pair)
|
|
// That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
|
|
// half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
|
|
// non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
|
|
//
|
|
// The table holds for invalid sequences too: we'll insert one replacement char
|
|
// per invalid byte.
|
|
QString result(in.size(), Qt::Uninitialized);
|
|
QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
|
|
const QChar *end = convertToUnicode(data, in);
|
|
result.truncate(end - data);
|
|
return result;
|
|
}
|
|
|
|
/*!
|
|
\since 5.7
|
|
\overload
|
|
|
|
Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
|
|
QChar starting at \a buffer. The buffer is expected to be large enough
|
|
to hold the result. An upper bound for the size of the buffer is
|
|
\c in.size() QChars.
|
|
|
|
If, during decoding, an error occurs, a QChar::ReplacementCharacter is
|
|
written.
|
|
|
|
Returns a pointer to one past the last QChar written.
|
|
|
|
This function never throws.
|
|
*/
|
|
|
|
QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
|
|
{
|
|
char16_t *dst = reinterpret_cast<char16_t *>(buffer);
|
|
const uchar *const start = reinterpret_cast<const uchar *>(in.data());
|
|
const uchar *src = start;
|
|
const uchar *end = src + in.size();
|
|
|
|
// attempt to do a full decoding in SIMD
|
|
const uchar *nextAscii = end;
|
|
if (!simdDecodeAscii(dst, nextAscii, src, end)) {
|
|
// at least one non-ASCII entry
|
|
// check if we failed to decode the UTF-8 BOM; if so, skip it
|
|
if (Q_UNLIKELY(src == start)
|
|
&& end - src >= 3
|
|
&& Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
|
|
src += 3;
|
|
}
|
|
|
|
while (src < end) {
|
|
nextAscii = end;
|
|
if (simdDecodeAscii(dst, nextAscii, src, end))
|
|
break;
|
|
|
|
do {
|
|
uchar b = *src++;
|
|
int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
|
|
if (res < 0) {
|
|
// decoding error
|
|
*dst++ = QChar::ReplacementCharacter;
|
|
}
|
|
} while (src < nextAscii);
|
|
}
|
|
}
|
|
|
|
return reinterpret_cast<QChar *>(dst);
|
|
}
|
|
|
|
QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
// See above for buffer requirements for stateless decoding. However, that
|
|
// fails if the state is not empty. The following situations can add to the
|
|
// requirements:
|
|
// state contains chars starts with requirement
|
|
// 1 of 2 bytes valid continuation 0
|
|
// 2 of 3 bytes same 0
|
|
// 3 bytes of 4 same +1 (need to insert surrogate pair)
|
|
// 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
|
|
// 2 of 3 bytes same +1 (same)
|
|
// 3 of 4 bytes same +1 (same)
|
|
QString result(in.size() + 1, Qt::Uninitialized);
|
|
QChar *end = convertToUnicode(result.data(), in, state);
|
|
result.truncate(end - result.constData());
|
|
return result;
|
|
}
|
|
|
|
QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
qsizetype len = in.size();
|
|
|
|
Q_ASSERT(state);
|
|
if (!len)
|
|
return out;
|
|
|
|
|
|
char16_t replacement = QChar::ReplacementCharacter;
|
|
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
|
|
replacement = QChar::Null;
|
|
|
|
int res;
|
|
uchar ch = 0;
|
|
|
|
char16_t *dst = reinterpret_cast<char16_t *>(out);
|
|
const uchar *src = reinterpret_cast<const uchar *>(in.data());
|
|
const uchar *end = src + len;
|
|
|
|
if (!(state->flags & QStringConverter::Flag::Stateless)) {
|
|
bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
|
|
if (state->remainingChars || !headerdone) {
|
|
// handle incoming state first
|
|
uchar remainingCharsData[4]; // longest UTF-8 sequence possible
|
|
qsizetype remainingCharsCount = state->remainingChars;
|
|
qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
|
|
|
|
memset(remainingCharsData, 0, sizeof(remainingCharsData));
|
|
memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
|
|
memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
|
|
|
|
const uchar *begin = &remainingCharsData[1];
|
|
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
|
|
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
|
|
if (res == QUtf8BaseTraits::Error) {
|
|
++state->invalidChars;
|
|
*dst++ = replacement;
|
|
++src;
|
|
} else if (res == QUtf8BaseTraits::EndOfString) {
|
|
// if we got EndOfString again, then there were too few bytes in src;
|
|
// copy to our state and return
|
|
state->remainingChars = remainingCharsCount + newCharsToCopy;
|
|
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
|
|
return out;
|
|
} else if (!headerdone) {
|
|
// eat the UTF-8 BOM
|
|
if (dst[-1] == 0xfeff)
|
|
--dst;
|
|
}
|
|
state->internalState |= HeaderDone;
|
|
|
|
// adjust src now that we have maybe consumed a few chars
|
|
if (res >= 0) {
|
|
Q_ASSERT(res > remainingCharsCount);
|
|
src += res - remainingCharsCount;
|
|
}
|
|
}
|
|
} else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
|
|
// stateless, remove initial BOM
|
|
if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
|
|
// skip BOM
|
|
src += 3;
|
|
}
|
|
|
|
// main body, stateless decoding
|
|
res = 0;
|
|
const uchar *nextAscii = src;
|
|
while (res >= 0 && src < end) {
|
|
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
|
|
break;
|
|
|
|
ch = *src++;
|
|
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
|
|
if (res == QUtf8BaseTraits::Error) {
|
|
res = 0;
|
|
++state->invalidChars;
|
|
*dst++ = replacement;
|
|
}
|
|
}
|
|
|
|
if (res == QUtf8BaseTraits::EndOfString) {
|
|
// unterminated UTF sequence
|
|
if (state->flags & QStringConverter::Flag::Stateless) {
|
|
*dst++ = QChar::ReplacementCharacter;
|
|
++state->invalidChars;
|
|
while (src++ < end) {
|
|
*dst++ = QChar::ReplacementCharacter;
|
|
++state->invalidChars;
|
|
}
|
|
state->remainingChars = 0;
|
|
} else {
|
|
--src; // unread the byte in ch
|
|
state->remainingChars = end - src;
|
|
memcpy(&state->state_data[0], src, end - src);
|
|
}
|
|
} else {
|
|
state->remainingChars = 0;
|
|
}
|
|
|
|
return reinterpret_cast<QChar *>(dst);
|
|
}
|
|
|
|
struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
|
|
{
|
|
struct NoOutput {};
|
|
static void appendUtf16(const NoOutput &, char16_t) {}
|
|
static void appendUcs4(const NoOutput &, char32_t) {}
|
|
};
|
|
|
|
QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
|
|
{
|
|
const uchar *src = reinterpret_cast<const uchar *>(in.data());
|
|
const uchar *end = src + in.size();
|
|
const uchar *nextAscii = src;
|
|
bool isValidAscii = true;
|
|
|
|
while (src < end) {
|
|
if (src >= nextAscii)
|
|
src = simdFindNonAscii(src, end, nextAscii);
|
|
if (src == end)
|
|
break;
|
|
|
|
do {
|
|
uchar b = *src++;
|
|
if ((b & 0x80) == 0)
|
|
continue;
|
|
|
|
isValidAscii = false;
|
|
QUtf8NoOutputTraits::NoOutput output;
|
|
int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
|
|
if (res < 0) {
|
|
// decoding error
|
|
return { false, false };
|
|
}
|
|
} while (src < nextAscii);
|
|
}
|
|
|
|
return { true, isValidAscii };
|
|
}
|
|
|
|
int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
|
|
{
|
|
auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
|
|
auto end1 = src1 + utf8.size();
|
|
auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
|
|
auto end2 = src2 + utf16.size();
|
|
|
|
do {
|
|
simdCompareAscii(src1, end1, src2, end2);
|
|
|
|
if (src1 < end1 && src2 < end2) {
|
|
char32_t uc1 = *src1++;
|
|
char32_t uc2 = *src2++;
|
|
|
|
if (uc1 >= 0x80) {
|
|
char32_t *output = &uc1;
|
|
int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
|
|
if (res < 0) {
|
|
// decoding error
|
|
uc1 = QChar::ReplacementCharacter;
|
|
}
|
|
|
|
// Only decode the UTF-16 surrogate pair if the UTF-8 code point
|
|
// wasn't US-ASCII (a surrogate cannot match US-ASCII).
|
|
if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
|
|
uc2 = QChar::surrogateToUcs4(uc2, *src2++);
|
|
}
|
|
|
|
if (uc1 != uc2)
|
|
return int(uc1) - int(uc2);
|
|
}
|
|
} while (src1 < end1 && src2 < end2);
|
|
|
|
// the shorter string sorts first
|
|
return (end1 > src1) - int(end2 > src2);
|
|
}
|
|
|
|
int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s)
|
|
{
|
|
char32_t uc1 = QChar::Null;
|
|
auto src1 = reinterpret_cast<const uchar *>(utf8.data());
|
|
auto end1 = src1 + utf8.size();
|
|
auto src2 = reinterpret_cast<const uchar *>(s.latin1());
|
|
auto end2 = src2 + s.size();
|
|
|
|
while (src1 < end1 && src2 < end2) {
|
|
uchar b = *src1++;
|
|
char32_t *output = &uc1;
|
|
int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
|
|
if (res < 0) {
|
|
// decoding error
|
|
uc1 = QChar::ReplacementCharacter;
|
|
}
|
|
|
|
char32_t uc2 = *src2++;
|
|
if (uc1 != uc2)
|
|
return int(uc1) - int(uc2);
|
|
}
|
|
|
|
// the shorter string sorts first
|
|
return (end1 > src1) - (end2 > src2);
|
|
}
|
|
|
|
QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
|
|
qsizetype length = 2 * in.size();
|
|
if (writeBom)
|
|
length += 2;
|
|
|
|
QByteArray d(length, Qt::Uninitialized);
|
|
char *end = convertFromUnicode(d.data(), in, state, endian);
|
|
Q_ASSERT(end - d.constData() == d.size());
|
|
Q_UNUSED(end);
|
|
return d;
|
|
}
|
|
|
|
char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
Q_ASSERT(state);
|
|
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
|
|
|
|
if (endian == DetectEndianness)
|
|
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
|
|
|
|
if (writeBom) {
|
|
// set them up the BOM
|
|
QChar bom(QChar::ByteOrderMark);
|
|
if (endian == BigEndianness)
|
|
qToBigEndian(bom.unicode(), out);
|
|
else
|
|
qToLittleEndian(bom.unicode(), out);
|
|
out += 2;
|
|
}
|
|
if (endian == BigEndianness)
|
|
qToBigEndian<char16_t>(in.data(), in.length(), out);
|
|
else
|
|
qToLittleEndian<char16_t>(in.data(), in.length(), out);
|
|
|
|
state->remainingChars = 0;
|
|
state->internalState |= HeaderDone;
|
|
return out + 2*in.length();
|
|
}
|
|
|
|
QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
|
|
QChar *qch = convertToUnicode(result.data(), in, state, endian);
|
|
result.truncate(qch - result.constData());
|
|
return result;
|
|
}
|
|
|
|
QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
qsizetype len = in.size();
|
|
const char *chars = in.data();
|
|
|
|
Q_ASSERT(state);
|
|
|
|
if (endian == DetectEndianness)
|
|
endian = (DataEndianness)state->state_data[Endian];
|
|
|
|
const char *end = chars + len;
|
|
|
|
// make sure we can decode at least one char
|
|
if (state->remainingChars + len < 2) {
|
|
if (len) {
|
|
Q_ASSERT(state->remainingChars == 0 && len == 1);
|
|
state->remainingChars = 1;
|
|
state->state_data[Data] = *chars;
|
|
}
|
|
return out;
|
|
}
|
|
|
|
bool headerdone = state && state->internalState & HeaderDone;
|
|
if (state->flags & QStringConverter::Flag::ConvertInitialBom)
|
|
headerdone = true;
|
|
|
|
if (!headerdone || state->remainingChars) {
|
|
uchar buf;
|
|
if (state->remainingChars)
|
|
buf = state->state_data[Data];
|
|
else
|
|
buf = *chars++;
|
|
|
|
// detect BOM, set endianness
|
|
state->internalState |= HeaderDone;
|
|
QChar ch(buf, *chars++);
|
|
if (endian == DetectEndianness) {
|
|
// someone set us up the BOM
|
|
if (ch == QChar::ByteOrderSwapped) {
|
|
endian = BigEndianness;
|
|
} else if (ch == QChar::ByteOrderMark) {
|
|
endian = LittleEndianness;
|
|
} else {
|
|
if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
|
|
endian = BigEndianness;
|
|
} else {
|
|
endian = LittleEndianness;
|
|
}
|
|
}
|
|
}
|
|
if (endian == BigEndianness)
|
|
ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
|
|
if (headerdone || ch != QChar::ByteOrderMark)
|
|
*out++ = ch;
|
|
} else if (endian == DetectEndianness) {
|
|
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
|
|
}
|
|
|
|
qsizetype nPairs = (end - chars) >> 1;
|
|
if (endian == BigEndianness)
|
|
qFromBigEndian<char16_t>(chars, nPairs, out);
|
|
else
|
|
qFromLittleEndian<char16_t>(chars, nPairs, out);
|
|
out += nPairs;
|
|
|
|
state->state_data[Endian] = endian;
|
|
state->remainingChars = 0;
|
|
if ((end - chars) & 1) {
|
|
if (state->flags & QStringConverter::Flag::Stateless) {
|
|
*out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
|
|
} else {
|
|
state->remainingChars = 1;
|
|
state->state_data[Data] = *(end - 1);
|
|
}
|
|
} else {
|
|
state->state_data[Data] = 0;
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
|
|
qsizetype length = 4*in.size();
|
|
if (writeBom)
|
|
length += 4;
|
|
QByteArray ba(length, Qt::Uninitialized);
|
|
char *end = convertFromUnicode(ba.data(), in, state, endian);
|
|
ba.truncate(end - ba.constData());
|
|
return ba;
|
|
}
|
|
|
|
char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
Q_ASSERT(state);
|
|
|
|
bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
|
|
if (endian == DetectEndianness)
|
|
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
|
|
|
|
if (writeBom) {
|
|
// set them up the BOM
|
|
if (endian == BigEndianness) {
|
|
out[0] = 0;
|
|
out[1] = 0;
|
|
out[2] = (char)0xfe;
|
|
out[3] = (char)0xff;
|
|
} else {
|
|
out[0] = (char)0xff;
|
|
out[1] = (char)0xfe;
|
|
out[2] = 0;
|
|
out[3] = 0;
|
|
}
|
|
out += 4;
|
|
state->internalState |= HeaderDone;
|
|
}
|
|
|
|
const QChar *uc = in.data();
|
|
const QChar *end = in.data() + in.length();
|
|
QChar ch;
|
|
char32_t ucs4;
|
|
if (state->remainingChars == 1) {
|
|
auto character = state->state_data[Data];
|
|
Q_ASSERT(character <= 0xFFFF);
|
|
ch = QChar(character);
|
|
// this is ugly, but shortcuts a whole lot of logic that would otherwise be required
|
|
state->remainingChars = 0;
|
|
goto decode_surrogate;
|
|
}
|
|
|
|
while (uc < end) {
|
|
ch = *uc++;
|
|
if (Q_LIKELY(!ch.isSurrogate())) {
|
|
ucs4 = ch.unicode();
|
|
} else if (Q_LIKELY(ch.isHighSurrogate())) {
|
|
decode_surrogate:
|
|
if (uc == end) {
|
|
if (state->flags & QStringConverter::Flag::Stateless) {
|
|
ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
|
|
} else {
|
|
state->remainingChars = 1;
|
|
state->state_data[Data] = ch.unicode();
|
|
return out;
|
|
}
|
|
} else if (uc->isLowSurrogate()) {
|
|
ucs4 = QChar::surrogateToUcs4(ch, *uc++);
|
|
} else {
|
|
ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
|
|
}
|
|
} else {
|
|
ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
|
|
}
|
|
if (endian == BigEndianness)
|
|
qToBigEndian(ucs4, out);
|
|
else
|
|
qToLittleEndian(ucs4, out);
|
|
out += 4;
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
QString result;
|
|
result.resize((in.size() + 7) >> 1); // worst case
|
|
QChar *end = convertToUnicode(result.data(), in, state, endian);
|
|
result.truncate(end - result.constData());
|
|
return result;
|
|
}
|
|
|
|
QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
|
|
{
|
|
qsizetype len = in.size();
|
|
const char *chars = in.data();
|
|
|
|
Q_ASSERT(state);
|
|
if (endian == DetectEndianness)
|
|
endian = (DataEndianness)state->state_data[Endian];
|
|
|
|
const char *end = chars + len;
|
|
|
|
uchar tuple[4];
|
|
memcpy(tuple, &state->state_data[Data], 4);
|
|
|
|
// make sure we can decode at least one char
|
|
if (state->remainingChars + len < 4) {
|
|
if (len) {
|
|
while (chars < end) {
|
|
tuple[state->remainingChars] = *chars;
|
|
++state->remainingChars;
|
|
++chars;
|
|
}
|
|
Q_ASSERT(state->remainingChars < 4);
|
|
memcpy(&state->state_data[Data], tuple, 4);
|
|
}
|
|
return out;
|
|
}
|
|
|
|
bool headerdone = state->internalState & HeaderDone;
|
|
if (state->flags & QStringConverter::Flag::ConvertInitialBom)
|
|
headerdone = true;
|
|
|
|
qsizetype num = state->remainingChars;
|
|
state->remainingChars = 0;
|
|
|
|
if (!headerdone || endian == DetectEndianness || num) {
|
|
while (num < 4)
|
|
tuple[num++] = *chars++;
|
|
if (endian == DetectEndianness) {
|
|
// someone set us up the BOM?
|
|
if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
|
|
endian = LittleEndianness;
|
|
} else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
|
|
endian = BigEndianness;
|
|
} else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
|
|
endian = BigEndianness;
|
|
} else {
|
|
endian = LittleEndianness;
|
|
}
|
|
}
|
|
char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
|
|
if (headerdone || code != QChar::ByteOrderMark) {
|
|
if (QChar::requiresSurrogates(code)) {
|
|
*out++ = QChar(QChar::highSurrogate(code));
|
|
*out++ = QChar(QChar::lowSurrogate(code));
|
|
} else {
|
|
*out++ = QChar(code);
|
|
}
|
|
}
|
|
num = 0;
|
|
} else if (endian == DetectEndianness) {
|
|
endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
|
|
}
|
|
state->state_data[Endian] = endian;
|
|
state->internalState |= HeaderDone;
|
|
|
|
while (chars < end) {
|
|
tuple[num++] = *chars++;
|
|
if (num == 4) {
|
|
char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
|
|
for (char16_t c : QChar::fromUcs4(code))
|
|
*out++ = c;
|
|
num = 0;
|
|
}
|
|
}
|
|
|
|
if (num) {
|
|
if (state->flags & QStringDecoder::Flag::Stateless) {
|
|
*out++ = QChar::ReplacementCharacter;
|
|
} else {
|
|
state->state_data[Endian] = endian;
|
|
state->remainingChars = num;
|
|
memcpy(&state->state_data[Data], tuple, 4);
|
|
}
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
|
|
int QLocal8Bit::checkUtf8()
|
|
{
|
|
return GetACP() == CP_UTF8 ? 1 : -1;
|
|
}
|
|
|
|
static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
qsizetype length = in.size();
|
|
const char *chars = in.data();
|
|
|
|
Q_ASSERT(state);
|
|
if (state->flags & QStringConverter::Flag::Stateless) // temporary
|
|
state = nullptr;
|
|
|
|
if (!chars || !length)
|
|
return QString();
|
|
|
|
qsizetype copyLocation = 0;
|
|
qsizetype extra = 2;
|
|
if (state && state->remainingChars) {
|
|
copyLocation = state->remainingChars;
|
|
extra += copyLocation;
|
|
}
|
|
qsizetype newLength = length + extra;
|
|
char *mbcs = new char[newLength];
|
|
//ensure that we have a NULL terminated string
|
|
mbcs[newLength-1] = 0;
|
|
mbcs[newLength-2] = 0;
|
|
memcpy(&(mbcs[copyLocation]), chars, length);
|
|
if (copyLocation) {
|
|
//copy the last character from the state
|
|
mbcs[0] = (char)state->state_data[0];
|
|
state->remainingChars = 0;
|
|
}
|
|
const char *mb = mbcs;
|
|
const char *next = 0;
|
|
QString s;
|
|
while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
|
|
wchar_t wc[2] ={0};
|
|
int charlength = int(next - mb); // always just a few bytes
|
|
int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
|
|
if (len>0) {
|
|
s.append(QChar(wc[0]));
|
|
} else {
|
|
int r = GetLastError();
|
|
//check if the character being dropped is the last character
|
|
if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
|
|
state->remainingChars = 1;
|
|
state->state_data[0] = (char)*mb;
|
|
}
|
|
}
|
|
mb = next;
|
|
}
|
|
delete [] mbcs;
|
|
return s;
|
|
}
|
|
|
|
|
|
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
qsizetype length = in.size();
|
|
|
|
Q_ASSERT(length < INT_MAX); // ### FIXME
|
|
const char *mb = in.data();
|
|
int mblen = length;
|
|
|
|
if (!mb || !mblen)
|
|
return QString();
|
|
|
|
QVarLengthArray<wchar_t, 4096> wc(4096);
|
|
int len;
|
|
QString sp;
|
|
bool prepend = false;
|
|
char state_data = 0;
|
|
int remainingChars = 0;
|
|
|
|
//save the current state information
|
|
if (state) {
|
|
state_data = (char)state->state_data[0];
|
|
remainingChars = state->remainingChars;
|
|
}
|
|
|
|
//convert the pending character (if available)
|
|
if (state && remainingChars) {
|
|
char prev[3] = {0};
|
|
prev[0] = state_data;
|
|
prev[1] = mb[0];
|
|
remainingChars = 0;
|
|
len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
|
|
prev, 2, wc.data(), wc.length());
|
|
if (len) {
|
|
sp.append(QChar(wc[0]));
|
|
if (mblen == 1) {
|
|
state->remainingChars = 0;
|
|
return sp;
|
|
}
|
|
prepend = true;
|
|
mb++;
|
|
mblen--;
|
|
wc[0] = 0;
|
|
}
|
|
}
|
|
|
|
while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
|
|
mb, mblen, wc.data(), wc.length()))) {
|
|
int r = GetLastError();
|
|
if (r == ERROR_INSUFFICIENT_BUFFER) {
|
|
const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
|
|
mb, mblen, 0, 0);
|
|
wc.resize(wclen);
|
|
} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
|
|
//find the last non NULL character
|
|
while (mblen > 1 && !(mb[mblen-1]))
|
|
mblen--;
|
|
//check whether, we hit an invalid character in the middle
|
|
if ((mblen <= 1) || (remainingChars && state_data))
|
|
return convertToUnicodeCharByChar(in, state);
|
|
//Remove the last character and try again...
|
|
state_data = mb[mblen-1];
|
|
remainingChars = 1;
|
|
mblen--;
|
|
} else {
|
|
// Fail.
|
|
qWarning("MultiByteToWideChar: Cannot convert multibyte text");
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (len <= 0)
|
|
return QString();
|
|
|
|
if (wc[len-1] == 0) // len - 1: we don't want terminator
|
|
--len;
|
|
|
|
//save the new state information
|
|
if (state) {
|
|
state->state_data[0] = (char)state_data;
|
|
state->remainingChars = remainingChars;
|
|
}
|
|
QString s((QChar*)wc.data(), len);
|
|
if (prepend) {
|
|
return sp+s;
|
|
}
|
|
return s;
|
|
}
|
|
|
|
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
|
|
{
|
|
const QChar *ch = in.data();
|
|
qsizetype uclen = in.size();
|
|
|
|
Q_ASSERT(uclen < INT_MAX); // ### FIXME
|
|
Q_ASSERT(state);
|
|
Q_UNUSED(state); // ### Fixme
|
|
if (state->flags & QStringConverter::Flag::Stateless) // temporary
|
|
state = nullptr;
|
|
|
|
if (!ch)
|
|
return QByteArray();
|
|
if (uclen == 0)
|
|
return QByteArray("");
|
|
BOOL used_def;
|
|
QByteArray mb(4096, 0);
|
|
int len;
|
|
while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
|
|
mb.data(), mb.size()-1, 0, &used_def)))
|
|
{
|
|
int r = GetLastError();
|
|
if (r == ERROR_INSUFFICIENT_BUFFER) {
|
|
mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
|
|
(const wchar_t*)ch, uclen,
|
|
0, 0, 0, &used_def));
|
|
// and try again...
|
|
} else {
|
|
// Fail. Probably can't happen in fact (dwFlags is 0).
|
|
#ifndef QT_NO_DEBUG
|
|
// Can't use qWarning(), as it'll recurse to handle %ls
|
|
fprintf(stderr,
|
|
"WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
|
|
r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
|
|
#endif
|
|
break;
|
|
}
|
|
}
|
|
mb.resize(len);
|
|
return mb;
|
|
}
|
|
#endif
|
|
|
|
void QStringConverter::State::clear() noexcept
|
|
{
|
|
if (clearFn)
|
|
clearFn(this);
|
|
else
|
|
state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
|
|
remainingChars = 0;
|
|
invalidChars = 0;
|
|
internalState = 0;
|
|
}
|
|
|
|
void QStringConverter::State::reset() noexcept
|
|
{
|
|
if (flags & Flag::UsesIcu) {
|
|
#if QT_CONFIG(icu)
|
|
UConverter *converter = static_cast<UConverter *>(d[0]);
|
|
if (converter)
|
|
ucnv_reset(converter);
|
|
#else
|
|
Q_UNREACHABLE();
|
|
#endif
|
|
} else {
|
|
clear();
|
|
}
|
|
}
|
|
|
|
static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
|
|
}
|
|
|
|
static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
|
|
}
|
|
|
|
static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf16::convertToUnicode(out, in, state, BigEndianness);
|
|
}
|
|
|
|
static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
|
|
}
|
|
|
|
static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
|
|
}
|
|
|
|
static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
|
|
}
|
|
|
|
static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
|
|
}
|
|
|
|
static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
|
|
}
|
|
|
|
static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf32::convertToUnicode(out, in, state, BigEndianness);
|
|
}
|
|
|
|
static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
|
|
}
|
|
|
|
static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
|
|
}
|
|
|
|
static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
|
|
}
|
|
|
|
void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
|
|
|
|
static QChar *fromLatin1(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
Q_ASSERT(state);
|
|
Q_UNUSED(state);
|
|
|
|
qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
|
|
return out + in.size();
|
|
}
|
|
|
|
|
|
static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
Q_ASSERT(state);
|
|
if (state->flags & QStringConverter::Flag::Stateless) // temporary
|
|
state = nullptr;
|
|
|
|
const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
|
|
qsizetype invalid = 0;
|
|
for (qsizetype i = 0; i < in.length(); ++i) {
|
|
if (in[i] > QChar(0xff)) {
|
|
*out = replacement;
|
|
++invalid;
|
|
} else {
|
|
*out = (char)in[i].cell();
|
|
}
|
|
++out;
|
|
}
|
|
if (state)
|
|
state->invalidChars += invalid;
|
|
return out;
|
|
}
|
|
|
|
static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
QString s = QLocal8Bit::convertToUnicode(in, state);
|
|
memcpy(out, s.constData(), s.size()*sizeof(QChar));
|
|
return out + s.size();
|
|
}
|
|
|
|
static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
|
|
memcpy(out, s.constData(), s.size());
|
|
return out + s.size();
|
|
}
|
|
|
|
|
|
static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
|
|
static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
|
|
|
|
static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
|
|
static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
|
|
|
|
static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
|
|
static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
|
|
|
|
static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
|
|
static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
|
|
|
|
|
|
|
|
/*!
|
|
\class QStringConverterBase
|
|
\internal
|
|
|
|
Just a common base class for QStringConverter and QTextCodec
|
|
*/
|
|
|
|
/*!
|
|
\class QStringConverter
|
|
\inmodule QtCore
|
|
\brief The QStringConverter class provides a base class for encoding and decoding text.
|
|
\reentrant
|
|
\ingroup i18n
|
|
|
|
Qt uses UTF-16 to store, draw and manipulate strings. In many
|
|
situations you may wish to deal with data that uses a different
|
|
encoding. Most text data transferred over files and network connections is encoded
|
|
in UTF-8.
|
|
|
|
The QStringConverter class is a base class for the \l {QStringEncoder} and
|
|
\l {QStringDecoder} classes that help with converting between different
|
|
text encodings. QStringDecoder can decode a string from an encoded representation
|
|
into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
|
|
operation, encoding UTF-16 encoded data (usually in the form of a QString) to
|
|
the requested encoding.
|
|
|
|
The supported encodings are:
|
|
|
|
\list
|
|
\li UTF-8
|
|
\li UTF-16
|
|
\li UTF-16BE
|
|
\li UTF-16LE
|
|
\li UTF-32
|
|
\li UTF-32BE
|
|
\li UTF-32LE
|
|
\li ISO-8859-1 (Latin-1)
|
|
\li The system encoding
|
|
\endlist
|
|
|
|
\l {QStringConverter}s can be used as follows to convert some encoded
|
|
string to and from UTF-16.
|
|
|
|
Suppose you have some string encoded in UTF-8, and
|
|
want to convert it to a QString. The simple way
|
|
to do it is to use a \l {QStringDecoder} like this:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 0
|
|
|
|
After this, \c string holds the text in decoded form.
|
|
Converting a string from Unicode to the local encoding is just as
|
|
easy using the \l {QStringEncoder} class:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 1
|
|
|
|
To read or write text files in various encodings, use QTextStream and
|
|
its \l{QTextStream::setEncoding()}{setEncoding()} function.
|
|
|
|
Some care must be taken when trying to convert the data in chunks,
|
|
for example, when receiving it over a network. In such cases it is
|
|
possible that a multi-byte character will be split over two
|
|
chunks. At best this might result in the loss of a character and
|
|
at worst cause the entire conversion to fail.
|
|
|
|
Both QStringEncoder and QStringDecoder make this easy, by tracking
|
|
this in an internal state. So simply calling the encoder or decoder
|
|
again with the next chunk of data will automatically continue encoding
|
|
or decoding the data correctly:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 2
|
|
|
|
The QStringDecoder object maintains state between chunks and therefore
|
|
works correctly even if a multi-byte character is split between
|
|
chunks.
|
|
|
|
QStringConverter objects can't be copied because of their internal state, but
|
|
can be moved.
|
|
|
|
\sa QTextStream, QStringDecoder, QStringEncoder
|
|
*/
|
|
|
|
/*!
|
|
\enum QStringConverter::Flag
|
|
|
|
\value Default Default conversion rules apply.
|
|
\value ConvertInvalidToNull If this flag is set, each invalid input
|
|
character is output as a null character. If it is not set,
|
|
invalid input characters are represented as QChar::ReplacementCharacter
|
|
if the output encoding can represent that character, otherwise as a question mark.
|
|
\value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
|
|
character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
|
|
encodings.
|
|
\value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips an
|
|
leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
|
|
skipped, but converted to utf-16 and inserted at the start of the created QString.
|
|
\value Stateless Ignore possible converter states between different function calls
|
|
to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
|
|
sequence of data is encountered.
|
|
\omitvalue UsesIcu
|
|
*/
|
|
|
|
/*!
|
|
\enum QStringConverter::Encoding
|
|
\value Utf8 Create a converter to or from UTF-8
|
|
\value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
|
|
detected by a leading byte order mark. If none exists or when encoding, the system byte order will
|
|
be assumed.
|
|
\value Utf16BE Create a converter to or from big-endian UTF-16.
|
|
\value Utf16LE Create a converter to or from little-endian UTF-16.
|
|
\value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
|
|
detected by a leading byte order mark. If none exists or when encoding, the system byte order will
|
|
be assumed.
|
|
\value Utf32BE Create a converter to or from big-endian UTF-32.
|
|
\value Utf32LE Create a converter to or from little-endian UTF-32.
|
|
\value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
|
|
\value System Create a converter to or from the underlying encoding of the
|
|
operating systems locale. This is always assumed to be UTF-8 for Unix based
|
|
systems. On Windows, this converts to and from the locale code page.
|
|
\omitvalue LastEncoding
|
|
*/
|
|
|
|
/*!
|
|
\struct QStringConverter::Interface
|
|
\internal
|
|
*/
|
|
|
|
const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
|
|
{
|
|
{ "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
|
|
{ "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
|
|
{ "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
|
|
{ "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
|
|
{ "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
|
|
{ "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
|
|
{ "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
|
|
{ "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
|
|
{ "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
|
|
};
|
|
|
|
// match names case insensitive and skipping '-' and '_'
|
|
static bool nameMatch(const char *a, const char *b)
|
|
{
|
|
while (*a && *b) {
|
|
if (*a == '-' || *a == '_') {
|
|
++a;
|
|
continue;
|
|
}
|
|
if (*b == '-' || *b == '_') {
|
|
++b;
|
|
continue;
|
|
}
|
|
if (QtMiscUtils::toAsciiLower(*a) != QtMiscUtils::toAsciiLower(*b))
|
|
return false;
|
|
++a;
|
|
++b;
|
|
}
|
|
return !*a && !*b;
|
|
}
|
|
|
|
|
|
/*!
|
|
\fn constexpr QStringConverter::QStringConverter()
|
|
\internal
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
|
|
\internal
|
|
*/
|
|
|
|
|
|
#if QT_CONFIG(icu)
|
|
// only derives from QStringConverter to get access to protected types
|
|
struct QStringConverterICU : QStringConverter
|
|
{
|
|
static void clear_function(QStringConverterBase::State *state) noexcept
|
|
{
|
|
ucnv_close(static_cast<UConverter *>(state->d[0]));
|
|
state->d[0] = nullptr;
|
|
}
|
|
|
|
static void ensureConverter(QStringConverter::State *state)
|
|
{
|
|
// old code might reset the state via clear instead of reset
|
|
// in that case, the converter has been closed, and we have to reopen it
|
|
if (state->d[0] == nullptr)
|
|
state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
|
|
}
|
|
|
|
static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
|
|
{
|
|
ensureConverter(state);
|
|
|
|
auto icu_conv = static_cast<UConverter *>(state->d[0]);
|
|
UErrorCode err = U_ZERO_ERROR;
|
|
auto source = in.data();
|
|
auto sourceLimit = in.data() + in.size();
|
|
|
|
qsizetype length = toLen(in.size());
|
|
|
|
UChar *target = reinterpret_cast<UChar *>(out);
|
|
auto targetLimit = target + length;
|
|
// We explicitly clean up anyway, so no need to set flush to true,
|
|
// which would just reset the converter.
|
|
UBool flush = false;
|
|
|
|
// If the QStringConverter was moved, the state that we used as a context is stale now.
|
|
UConverterToUCallback action;
|
|
const void *context;
|
|
ucnv_getToUCallBack(icu_conv, &action, &context);
|
|
if (context != state)
|
|
ucnv_setToUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
|
|
|
|
ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
|
|
// We did reserve enough space:
|
|
Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
|
|
if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
|
|
if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
|
|
ucnv_reset(icu_conv);
|
|
state->invalidChars += leftOver;
|
|
}
|
|
}
|
|
return reinterpret_cast<QChar *>(target);
|
|
}
|
|
|
|
static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
|
|
{
|
|
ensureConverter(state);
|
|
auto icu_conv = static_cast<UConverter *>(state->d[0]);
|
|
UErrorCode err = U_ZERO_ERROR;
|
|
auto source = reinterpret_cast<const UChar *>(in.data());
|
|
auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
|
|
|
|
qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.length(), ucnv_getMaxCharSize(icu_conv));
|
|
|
|
char *target = out;
|
|
char *targetLimit = out + length;
|
|
UBool flush = false;
|
|
|
|
// If the QStringConverter was moved, the state that we used as a context is stale now.
|
|
UConverterFromUCallback action;
|
|
const void *context;
|
|
ucnv_getFromUCallBack(icu_conv, &action, &context);
|
|
if (context != state)
|
|
ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
|
|
|
|
ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
|
|
// We did reserve enough space:
|
|
Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
|
|
if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
|
|
if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
|
|
ucnv_reset(icu_conv);
|
|
state->invalidChars += leftOver;
|
|
}
|
|
}
|
|
return target;
|
|
}
|
|
|
|
Q_DISABLE_COPY_MOVE(QStringConverterICU)
|
|
|
|
template<qsizetype X>
|
|
static qsizetype fromLen(qsizetype inLength)
|
|
{
|
|
return X * inLength * sizeof(UChar);
|
|
}
|
|
|
|
static qsizetype toLen(qsizetype inLength)
|
|
{
|
|
|
|
/* Assumption: each input char might map to a different codepoint
|
|
Each codepoint can take up to 4 bytes == 2 QChar
|
|
We can ignore reserving space for a BOM, as only UTF encodings use one
|
|
and those are not handled by the ICU converter.
|
|
*/
|
|
return 2 * inLength;
|
|
}
|
|
|
|
static constexpr QStringConverter::Interface forLength[] = {
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
|
|
{"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
|
|
};
|
|
|
|
static UConverter *createConverterForName(const char *name, const State *state)
|
|
{
|
|
Q_ASSERT(name);
|
|
Q_ASSERT(state);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UConverter *conv = ucnv_open(name, &status);
|
|
if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
|
|
ucnv_close(conv);
|
|
return nullptr;
|
|
}
|
|
|
|
if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
|
|
UErrorCode error = U_ZERO_ERROR;
|
|
|
|
auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
|
|
const char *, int32_t length,
|
|
UConverterCallbackReason reason, UErrorCode *err) {
|
|
if (reason <= UCNV_IRREGULAR) {
|
|
*err = U_ZERO_ERROR;
|
|
UChar c = '\0';
|
|
ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
|
|
// Recover outer scope's state (which isn't const) from context:
|
|
auto state = const_cast<State *>(static_cast<const State *>(context));
|
|
state->invalidChars += length;
|
|
}
|
|
};
|
|
ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
|
|
|
|
auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
|
|
const UChar *, int32_t length,
|
|
UChar32, UConverterCallbackReason reason, UErrorCode *err) {
|
|
if (reason <= UCNV_IRREGULAR) {
|
|
*err = U_ZERO_ERROR;
|
|
const UChar replacement[] = { 0 };
|
|
const UChar *stringBegin = std::begin(replacement);
|
|
ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
|
|
// Recover outer scope's state (which isn't const) from context:
|
|
auto state = const_cast<State *>(static_cast<const State *>(context));
|
|
state->invalidChars += length;
|
|
}
|
|
};
|
|
ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
|
|
} else {
|
|
UErrorCode error = U_ZERO_ERROR;
|
|
|
|
auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
|
|
const char *codeUnits,int32_t length,
|
|
UConverterCallbackReason reason, UErrorCode *err) {
|
|
if (reason <= UCNV_IRREGULAR) {
|
|
// Recover outer scope's state (which isn't const) from context:
|
|
auto state = const_cast<State *>(static_cast<const State *>(context));
|
|
state->invalidChars += length;
|
|
}
|
|
// use existing ICU callback for logic
|
|
UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
|
|
|
|
};
|
|
ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
|
|
|
|
auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
|
|
const UChar *codeUnits, int32_t length,
|
|
UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
|
|
if (reason <= UCNV_IRREGULAR) {
|
|
// Recover outer scope's state (which isn't const) from context:
|
|
auto state = const_cast<State *>(static_cast<const State *>(context));
|
|
state->invalidChars += length;
|
|
}
|
|
// use existing ICU callback for logic
|
|
UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
|
|
codePoint, reason, err);
|
|
};
|
|
ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
|
|
}
|
|
return conv;
|
|
}
|
|
|
|
static const QStringConverter::Interface *make_icu_converter(
|
|
QStringConverterBase::State *state,
|
|
const char *name)
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UConverter *conv = createConverterForName(name, state);
|
|
if (!conv)
|
|
return nullptr;
|
|
|
|
const char *icuName = ucnv_getName(conv, &status);
|
|
// ucnv_getStandardName returns a name which is owned by the library
|
|
// we can thus store it in the state without worrying aobut its lifetime
|
|
const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
|
|
if (U_FAILURE(status) || !persistentName) {
|
|
status = U_ZERO_ERROR;
|
|
persistentName = ucnv_getStandardName(icuName, "IANA", &status);
|
|
}
|
|
state->d[1] = const_cast<char *>(persistentName);
|
|
state->d[0] = conv;
|
|
state->flags |= QStringConverterBase::Flag::UsesIcu;
|
|
qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
|
|
state->clearFn = QStringConverterICU::clear_function;
|
|
if (maxCharSize > 8 || maxCharSize < 1) {
|
|
qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
|
|
return nullptr;
|
|
} else {
|
|
return &forLength[maxCharSize - 1];
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
#endif
|
|
|
|
/*!
|
|
\internal
|
|
*/
|
|
QStringConverter::QStringConverter(const char *name, Flags f)
|
|
: iface(nullptr), state(f)
|
|
{
|
|
auto e = encodingForName(name);
|
|
if (e)
|
|
iface = encodingInterfaces + int(*e);
|
|
#if QT_CONFIG(icu)
|
|
else
|
|
iface = QStringConverterICU::make_icu_converter(&state, name);
|
|
#endif
|
|
}
|
|
|
|
|
|
const char *QStringConverter::name() const noexcept
|
|
{
|
|
if (!iface)
|
|
return nullptr;
|
|
if (state.flags & QStringConverter::Flag::UsesIcu) {
|
|
#if QT_CONFIG(icu)
|
|
return static_cast<const char*>(state.d[1]);
|
|
#else
|
|
return nullptr;
|
|
#endif
|
|
} else {
|
|
return iface->name;
|
|
}
|
|
}
|
|
|
|
/*!
|
|
\fn bool QStringConverter::isValid() const
|
|
|
|
Returns true if this is a valid string converter that can be used for encoding or
|
|
decoding text.
|
|
|
|
Default constructed string converters or converters constructed with an unsupported
|
|
name are not valid.
|
|
*/
|
|
|
|
/*!
|
|
\fn void QStringConverter::resetState()
|
|
|
|
Resets the internal state of the converter, clearing potential errors or partial
|
|
conversions.
|
|
*/
|
|
|
|
/*!
|
|
\fn bool QStringConverter::hasError() const
|
|
|
|
Returns true if a conversion could not correctly convert a character. This could for example
|
|
get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
|
|
limitations in the target encoding.
|
|
*/
|
|
|
|
/*!
|
|
\fn const char *QStringConverter::name() const
|
|
|
|
Returns the canonical name of the encoding this QStringConverter can encode or decode.
|
|
Returns a nullptr if the converter is not valid.
|
|
|
|
\sa isValid()
|
|
*/
|
|
|
|
/*!
|
|
Convert \a name to the corresponding \l Encoding member, if there is one.
|
|
|
|
If the \a name is not the name of a codec listed in the Encoding enumeration,
|
|
\c{std::nullopt} is returned. Such a name may, none the less, be accepted by
|
|
the QStringConverter constructor when Qt is built with ICU, if ICU provides a
|
|
converter with the given name.
|
|
*/
|
|
std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
|
|
{
|
|
for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
|
|
if (nameMatch(encodingInterfaces[i].name, name))
|
|
return QStringConverter::Encoding(i);
|
|
}
|
|
if (nameMatch(name, "latin1"))
|
|
return QStringConverter::Latin1;
|
|
return std::nullopt;
|
|
}
|
|
|
|
/*!
|
|
Returns the encoding for the content of \a data if it can be determined.
|
|
\a expectedFirstCharacter can be passed as an additional hint to help determine
|
|
the encoding.
|
|
|
|
The returned optional is empty, if the encoding is unclear.
|
|
*/
|
|
std::optional<QStringConverter::Encoding>
|
|
QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
|
|
{
|
|
// someone set us up the BOM?
|
|
qsizetype arraySize = data.size();
|
|
if (arraySize > 3) {
|
|
char32_t uc = qFromUnaligned<char32_t>(data.data());
|
|
if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
|
|
return QStringConverter::Utf32BE;
|
|
if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
|
|
return QStringConverter::Utf32LE;
|
|
if (expectedFirstCharacter) {
|
|
// catch also anything starting with the expected character
|
|
if (qToLittleEndian(uc) == expectedFirstCharacter)
|
|
return QStringConverter::Utf32LE;
|
|
else if (qToBigEndian(uc) == expectedFirstCharacter)
|
|
return QStringConverter::Utf32BE;
|
|
}
|
|
}
|
|
|
|
if (arraySize > 2) {
|
|
if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
|
|
return QStringConverter::Utf8;
|
|
}
|
|
|
|
if (arraySize > 1) {
|
|
char16_t uc = qFromUnaligned<char16_t>(data.data());
|
|
if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
|
|
return QStringConverter::Utf16BE;
|
|
if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
|
|
return QStringConverter::Utf16LE;
|
|
if (expectedFirstCharacter) {
|
|
// catch also anything starting with the expected character
|
|
if (qToLittleEndian(uc) == expectedFirstCharacter)
|
|
return QStringConverter::Utf16LE;
|
|
else if (qToBigEndian(uc) == expectedFirstCharacter)
|
|
return QStringConverter::Utf16BE;
|
|
}
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
|
|
{
|
|
static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
|
|
static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
|
|
|
|
QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
|
|
qsizetype pos = metaSearcher.indexIn(header);
|
|
if (pos != -1) {
|
|
pos = charsetSearcher.indexIn(header, pos);
|
|
if (pos != -1) {
|
|
pos += qstrlen("charset=");
|
|
if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
|
|
++pos;
|
|
|
|
qsizetype pos2 = pos;
|
|
// The attribute can be closed with either """, "'", ">" or "/",
|
|
// none of which are valid charset characters.
|
|
while (++pos2 < header.size()) {
|
|
char ch = header.at(pos2);
|
|
if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
|
|
QByteArray name = header.mid(pos, pos2 - pos);
|
|
qsizetype colon = name.indexOf(':');
|
|
if (colon > 0)
|
|
name = name.left(colon);
|
|
name = name.simplified();
|
|
if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
|
|
name = QByteArrayLiteral("UTF-8");
|
|
if (!name.isEmpty())
|
|
return name;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return QByteArray();
|
|
}
|
|
|
|
/*!
|
|
Tries to determine the encoding of the HTML in \a data by looking at leading byte
|
|
order marks or a charset specifier in the HTML meta tag. If the optional is empty,
|
|
the encoding specified is not supported by QStringConverter. If no encoding is
|
|
detected, the method returns Utf8.
|
|
|
|
\sa QStringDecoder::decoderForHtml()
|
|
*/
|
|
std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
|
|
{
|
|
// determine charset
|
|
std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
|
|
if (encoding)
|
|
// trust the initial BOM
|
|
return encoding;
|
|
|
|
QByteArray encodingTag = parseHtmlMetaForEncoding(data);
|
|
if (!encodingTag.isEmpty())
|
|
return encodingForName(encodingTag);
|
|
|
|
return Utf8;
|
|
}
|
|
|
|
/*!
|
|
Tries to determine the encoding of the HTML in \a data by looking at leading byte
|
|
order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
|
|
matching the encoding. If the returned decoder is not valid,
|
|
the encoding specified is not supported by QStringConverter. If no encoding is
|
|
detected, the method returns a decoder for Utf8.
|
|
|
|
\sa isValid()
|
|
*/
|
|
QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
|
|
{
|
|
// determine charset
|
|
std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
|
|
if (encoding)
|
|
// trust the initial BOM
|
|
return QStringDecoder(encoding.value());
|
|
|
|
QByteArray encodingTag = parseHtmlMetaForEncoding(data);
|
|
if (!encodingTag.isEmpty())
|
|
return QStringDecoder(encodingTag);
|
|
|
|
return QStringDecoder(Utf8);
|
|
}
|
|
|
|
|
|
/*!
|
|
Returns the canonical name for encoding \a e.
|
|
*/
|
|
const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
|
|
{
|
|
return encodingInterfaces[int(e)].name;
|
|
}
|
|
|
|
/*!
|
|
\class QStringEncoder
|
|
\inmodule QtCore
|
|
\brief The QStringEncoder class provides a state-based encoder for text.
|
|
\reentrant
|
|
\ingroup i18n
|
|
|
|
A text encoder converts text from Qt's internal representation into an encoded
|
|
text format using a specific encoding.
|
|
|
|
Converting a string from Unicode to the local encoding can be achieved
|
|
using the following code:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 1
|
|
|
|
The encoder remembers any state that is required between calls, so converting
|
|
data received in chunks, for example, when receiving it over a network, is just as
|
|
easy, by calling the encoder whenever new data is available:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 3
|
|
|
|
The QStringEncoder object maintains state between chunks and therefore
|
|
works correctly even if a UTF-16 surrogate character is split between
|
|
chunks.
|
|
|
|
QStringEncoder objects can't be copied because of their internal state, but
|
|
can be moved.
|
|
|
|
\sa QStringConverter, QStringDecoder
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
|
|
\internal
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringEncoder::QStringEncoder()
|
|
|
|
Default constructs an encoder. The default encoder is not valid,
|
|
and can't be used for converting text.
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
|
|
|
|
Creates an encoder object using \a encoding and \a flags.
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringEncoder::QStringEncoder(const char *name, Flags flags = Flag::Default)
|
|
|
|
Creates an encoder object using \a name and \a flags.
|
|
If \a name is not the name of a known encoding an invalid converter will get created.
|
|
|
|
\sa isValid()
|
|
*/
|
|
|
|
/*!
|
|
\fn QByteArray QStringEncoder::encode(const QString &in)
|
|
\fn QByteArray QStringEncoder::encode(QStringView in)
|
|
\fn QByteArray QStringEncoder::operator()(const QString &in)
|
|
\fn QByteArray QStringEncoder::operator()(QStringView in)
|
|
|
|
Converts \a in and returns the data as a byte array.
|
|
*/
|
|
|
|
/*!
|
|
\fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
|
|
|
|
Returns the maximum amount of characters required to be able to process
|
|
\a inputLength decoded data.
|
|
|
|
\sa appendToBuffer()
|
|
*/
|
|
|
|
/*!
|
|
\fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)
|
|
|
|
Encodes \a in and writes the encoded result into the buffer
|
|
starting at \a out. Returns a pointer to the end of the data written.
|
|
|
|
\note \a out must be large enough to be able to hold all the decoded data. Use
|
|
requiredSpace() to determine the maximum size requirement to be able to encode
|
|
\a in.
|
|
|
|
\sa requiredSpace()
|
|
*/
|
|
|
|
/*!
|
|
\class QStringDecoder
|
|
\inmodule QtCore
|
|
\brief The QStringDecoder class provides a state-based decoder for text.
|
|
\reentrant
|
|
\ingroup i18n
|
|
|
|
A text decoder converts text an encoded text format that uses a specific encoding
|
|
into Qt's internal representation.
|
|
|
|
Converting encoded data into a QString can be achieved
|
|
using the following code:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 0
|
|
|
|
The decoder remembers any state that is required between calls, so converting
|
|
data received in chunks, for example, when receiving it over a network, is just as
|
|
easy, by calling the decoder whenever new data is available:
|
|
|
|
\snippet code/src_corelib_text_qstringconverter.cpp 2
|
|
|
|
The QStringDecoder object maintains state between chunks and therefore
|
|
works correctly even if chunks are split in the middle of a multi-byte character
|
|
sequence.
|
|
|
|
QStringDecoder objects can't be copied because of their internal state, but
|
|
can be moved.
|
|
|
|
\sa QStringConverter, QStringEncoder
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
|
|
\internal
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringDecoder::QStringDecoder()
|
|
|
|
Default constructs an decoder. The default decoder is not valid,
|
|
and can't be used for converting text.
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
|
|
|
|
Creates an decoder object using \a encoding and \a flags.
|
|
*/
|
|
|
|
/*!
|
|
\fn constexpr QStringDecoder::QStringDecoder(const char *name, Flags flags = Flag::Default)
|
|
|
|
Creates an decoder object using \a name and \a flags.
|
|
If \a name is not the name of a known encoding an invalid converter will get created.
|
|
|
|
\sa isValid()
|
|
*/
|
|
|
|
/*!
|
|
\fn QString QStringDecoder::operator()(const QByteArray &ba)
|
|
\fn QString QStringDecoder::decode(const QByteArray &ba)
|
|
\fn QString QStringDecoder::operator()(QByteArrayView ba)
|
|
\fn QString QStringDecoder::decode(QByteArrayView ba)
|
|
|
|
Converts \a ba and returns the data as a QString.
|
|
*/
|
|
|
|
/*!
|
|
\fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
|
|
|
|
Returns the maximum amount of UTF-16 code units required to be able to process
|
|
\a inputLength encoded data.
|
|
|
|
\sa appendToBuffer
|
|
*/
|
|
|
|
/*!
|
|
\fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
|
|
|
|
Decodes the sequence of bytes viewed by \a in and writes the decoded result into
|
|
the buffer starting at \a out. Returns a pointer to the end of data written.
|
|
|
|
\a out needs to be large enough to be able to hold all the decoded data. Use
|
|
\l{requiredSpace} to determine the maximum size requirements to decode an encoded
|
|
data buffer of \c in.size() bytes.
|
|
|
|
\sa requiredSpace
|
|
*/
|
|
|
|
QT_END_NAMESPACE
|