QUtf8: merge the convert{To,From}Unicode main bodies
Instead of duplicating the loop, centralize it in a template function that calls out to a lambda to handle the error condition, so that the stateful overloads can record the error in the converter state.

Change-Id: If2730356303f721bc3bbfffd2640666da8d65f1d
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
2553d9709f
commit
2fcb905fef
@ -472,13 +472,12 @@ static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t
|
|||||||
|
|
||||||
enum { HeaderDone = 1 };
|
enum { HeaderDone = 1 };
|
||||||
|
|
||||||
QByteArray QUtf8::convertFromUnicode(QStringView in)
|
template <typename OnErrorLambda> Q_ALWAYS_INLINE
|
||||||
|
char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
|
||||||
{
|
{
|
||||||
qsizetype len = in.size();
|
qsizetype len = in.size();
|
||||||
|
|
||||||
// create a QByteArray with the worst case scenario size
|
uchar *dst = reinterpret_cast<uchar *>(out);
|
||||||
QByteArray result(len * 3, Qt::Uninitialized);
|
|
||||||
uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
|
|
||||||
const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
|
const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
|
||||||
const char16_t *const end = src + len;
|
const char16_t *const end = src + len;
|
||||||
|
|
||||||
@ -490,14 +489,27 @@ QByteArray QUtf8::convertFromUnicode(QStringView in)
|
|||||||
do {
|
do {
|
||||||
char16_t u = *src++;
|
char16_t u = *src++;
|
||||||
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
|
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
|
||||||
if (res < 0) {
|
if (Q_UNLIKELY(res < 0))
|
||||||
// encoding error - append '?'
|
onError(dst, u, res);
|
||||||
*dst++ = '?';
|
|
||||||
}
|
|
||||||
} while (src < nextAscii);
|
} while (src < nextAscii);
|
||||||
}
|
}
|
||||||
|
|
||||||
result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
|
return reinterpret_cast<char *>(dst);
|
||||||
|
}
|
||||||
|
|
||||||
|
QByteArray QUtf8::convertFromUnicode(QStringView in)
|
||||||
|
{
|
||||||
|
qsizetype len = in.size();
|
||||||
|
|
||||||
|
// create a QByteArray with the worst case scenario size
|
||||||
|
QByteArray result(len * 3, Qt::Uninitialized);
|
||||||
|
char *dst = const_cast<char *>(result.constData());
|
||||||
|
dst = convertFromUnicode(dst, in, [](auto *dst, ...) {
|
||||||
|
// encoding error - append '?'
|
||||||
|
*dst++ = '?';
|
||||||
|
});
|
||||||
|
|
||||||
|
result.truncate(dst - result.constData());
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -548,35 +560,22 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while (src != end) {
|
out = reinterpret_cast<char *>(cursor);
|
||||||
const char16_t *nextAscii = end;
|
return convertFromUnicode(out, { src, end }, [&](uchar *&cursor, char16_t uc, int res) {
|
||||||
if (simdEncodeAscii(cursor, nextAscii, src, end))
|
if (res == QUtf8BaseTraits::Error) {
|
||||||
break;
|
// encoding error
|
||||||
|
++state->invalidChars;
|
||||||
do {
|
cursor = appendReplacementChar(cursor);
|
||||||
char16_t uc = *src++;
|
} else if (res == QUtf8BaseTraits::EndOfString) {
|
||||||
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
|
if (state->flags & QStringConverter::Flag::Stateless) {
|
||||||
if (Q_LIKELY(res >= 0))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (res == QUtf8BaseTraits::Error) {
|
|
||||||
// encoding error
|
|
||||||
++state->invalidChars;
|
++state->invalidChars;
|
||||||
cursor = appendReplacementChar(cursor);
|
cursor = appendReplacementChar(cursor);
|
||||||
} else if (res == QUtf8BaseTraits::EndOfString) {
|
} else {
|
||||||
if (state->flags & QStringConverter::Flag::Stateless) {
|
state->remainingChars = 1;
|
||||||
++state->invalidChars;
|
state->state_data[0] = uc;
|
||||||
cursor = appendReplacementChar(cursor);
|
|
||||||
} else {
|
|
||||||
state->remainingChars = 1;
|
|
||||||
state->state_data[0] = uc;
|
|
||||||
}
|
|
||||||
return reinterpret_cast<char *>(cursor);
|
|
||||||
}
|
}
|
||||||
} while (src < nextAscii);
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
return reinterpret_cast<char *>(cursor);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
|
char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
|
||||||
@ -635,6 +634,21 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
|
|||||||
QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
|
QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
|
||||||
*/
|
*/
|
||||||
char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
|
char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
|
||||||
|
{
|
||||||
|
// check if have to skip a BOM
|
||||||
|
auto bom = QByteArrayView::fromArray(utf8bom);
|
||||||
|
if (in.size() >= bom.size() && in.first(bom.size()) == bom)
|
||||||
|
in.slice(sizeof(utf8bom));
|
||||||
|
|
||||||
|
return convertToUnicode(dst, in, [](char16_t *&dst, ...) {
|
||||||
|
// decoding error
|
||||||
|
*dst++ = QChar::ReplacementCharacter;
|
||||||
|
return true; // continue decoding
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
|
||||||
|
QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
|
||||||
{
|
{
|
||||||
const uchar *const start = reinterpret_cast<const uchar *>(in.data());
|
const uchar *const start = reinterpret_cast<const uchar *>(in.data());
|
||||||
const uchar *src = start;
|
const uchar *src = start;
|
||||||
@ -642,29 +656,20 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
|
|||||||
|
|
||||||
// attempt to do a full decoding in SIMD
|
// attempt to do a full decoding in SIMD
|
||||||
const uchar *nextAscii = end;
|
const uchar *nextAscii = end;
|
||||||
if (!simdDecodeAscii(dst, nextAscii, src, end)) {
|
while (src < end) {
|
||||||
// at least one non-ASCII entry
|
nextAscii = end;
|
||||||
// check if we failed to decode the UTF-8 BOM; if so, skip it
|
if (simdDecodeAscii(dst, nextAscii, src, end))
|
||||||
if (Q_UNLIKELY(src == start)
|
break;
|
||||||
&& end - src >= 3
|
|
||||||
&& Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
|
|
||||||
src += 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (src < end) {
|
do {
|
||||||
nextAscii = end;
|
uchar b = *src++;
|
||||||
if (simdDecodeAscii(dst, nextAscii, src, end))
|
const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
|
||||||
break;
|
if (Q_LIKELY(res >= 0))
|
||||||
|
continue;
|
||||||
do {
|
// decoding error
|
||||||
uchar b = *src++;
|
if (!onError(dst, src, res))
|
||||||
const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
|
return dst;
|
||||||
if (res < 0) {
|
} while (src < nextAscii);
|
||||||
// decoding error
|
|
||||||
*dst++ = QChar::ReplacementCharacter;
|
|
||||||
}
|
|
||||||
} while (src < nextAscii);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return dst;
|
return dst;
|
||||||
@ -702,7 +707,6 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
|
|||||||
replacement = QChar::Null;
|
replacement = QChar::Null;
|
||||||
|
|
||||||
qsizetype res;
|
qsizetype res;
|
||||||
uchar ch = 0;
|
|
||||||
|
|
||||||
const uchar *src = reinterpret_cast<const uchar *>(in.data());
|
const uchar *src = reinterpret_cast<const uchar *>(in.data());
|
||||||
const uchar *end = src + len;
|
const uchar *end = src + len;
|
||||||
@ -754,19 +758,16 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
|
|||||||
|
|
||||||
// main body, stateless decoding
|
// main body, stateless decoding
|
||||||
res = 0;
|
res = 0;
|
||||||
const uchar *nextAscii = src;
|
dst = convertToUnicode(dst, { src, end }, [&](char16_t *&dst, const uchar *src_, int res_) {
|
||||||
while (res >= 0 && src < end) {
|
res = res_;
|
||||||
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
|
src = src_;
|
||||||
break;
|
|
||||||
|
|
||||||
ch = *src++;
|
|
||||||
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
|
|
||||||
if (res == QUtf8BaseTraits::Error) {
|
if (res == QUtf8BaseTraits::Error) {
|
||||||
res = 0;
|
res = 0;
|
||||||
++state->invalidChars;
|
++state->invalidChars;
|
||||||
*dst++ = replacement;
|
*dst++ = replacement;
|
||||||
}
|
}
|
||||||
}
|
return res == 0; // continue if plain decoding error
|
||||||
|
});
|
||||||
|
|
||||||
if (res == QUtf8BaseTraits::EndOfString) {
|
if (res == QUtf8BaseTraits::EndOfString) {
|
||||||
// unterminated UTF sequence
|
// unterminated UTF sequence
|
||||||
|
@ -334,6 +334,12 @@ struct QUtf8
|
|||||||
Qt::CaseSensitivity cs = Qt::CaseSensitive);
|
Qt::CaseSensitivity cs = Qt::CaseSensitive);
|
||||||
static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
|
static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
|
||||||
Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
|
Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
|
||||||
|
|
||||||
|
private:
|
||||||
|
template <typename OnErrorLambda> static char *
|
||||||
|
convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept;
|
||||||
|
template <typename OnErrorLambda> static char16_t *
|
||||||
|
convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct QUtf16
|
struct QUtf16
|
||||||
|
Loading…
x
Reference in New Issue
Block a user