Add functions for fast bulk conversion of qfloat16
Both ARM and x86 can convert fp16 much faster in bulk than one at a time. This also enables hardware accelerated conversion on x86, when F16C isn't unconditionally available at compile time. This code is implemented in C to ensure that there's no leakage of inline symbols from the .obj file that was compiled by Visual Studio with AVX support. Unfortunately, simd.prf uses $(CXX) instead of $(CC) for all its sources, which means the file gets interpreted as C++ by g++, clang++ and icpc. Those compilers at least don't leak any symbols. Done-with: Thiago Macieira <thiago.macieira@intel.com> Change-Id: I9d26d99e83392861fb09564e0e8e8d76cd8483b3 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
0ac2dca977
commit
b8e352ad37
@ -11,7 +11,7 @@ DEFINES += $$MODULE_DEFINES
|
|||||||
DEFINES += QT_NO_USING_NAMESPACE QT_NO_FOREACH
|
DEFINES += QT_NO_USING_NAMESPACE QT_NO_FOREACH
|
||||||
win32-msvc*|win32-icc:QMAKE_LFLAGS += /BASE:0x67000000
|
win32-msvc*|win32-icc:QMAKE_LFLAGS += /BASE:0x67000000
|
||||||
|
|
||||||
CONFIG += optimize_full
|
CONFIG += simd optimize_full
|
||||||
|
|
||||||
QMAKE_DOCS = $$PWD/doc/qtcore.qdocconf
|
QMAKE_DOCS = $$PWD/doc/qtcore.qdocconf
|
||||||
|
|
||||||
|
@ -39,6 +39,8 @@ SOURCES += \
|
|||||||
global/qrandom.cpp \
|
global/qrandom.cpp \
|
||||||
global/qhooks.cpp
|
global/qhooks.cpp
|
||||||
|
|
||||||
|
F16C_SOURCES += global/qfloat16_f16c.c
|
||||||
|
|
||||||
VERSIONTAGGING_SOURCES = global/qversiontagging.cpp
|
VERSIONTAGGING_SOURCES = global/qversiontagging.cpp
|
||||||
|
|
||||||
darwin: SOURCES += global/qoperatingsystemversion_darwin.mm
|
darwin: SOURCES += global/qoperatingsystemversion_darwin.mm
|
||||||
|
@ -38,6 +38,7 @@
|
|||||||
****************************************************************************/
|
****************************************************************************/
|
||||||
|
|
||||||
#include "qfloat16_p.h"
|
#include "qfloat16_p.h"
|
||||||
|
#include "private/qsimd_p.h"
|
||||||
|
|
||||||
QT_BEGIN_NAMESPACE
|
QT_BEGIN_NAMESPACE
|
||||||
|
|
||||||
@ -113,4 +114,88 @@ Q_REQUIRED_RESULT bool qIsFinite(qfloat16 f) Q_DECL_NOTHROW { return qt_is_finit
|
|||||||
exactness is stronger the smaller the numbers are.
|
exactness is stronger the smaller the numbers are.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#if QT_COMPILER_SUPPORTS(F16C)
|
||||||
|
static inline bool hasFastF16()
|
||||||
|
{
|
||||||
|
// All processors with F16C also support AVX, but YMM registers
|
||||||
|
// might not be supported by the OS, or they might be disabled.
|
||||||
|
return qCpuHasFeature(F16C) && qCpuHasFeature(AVX);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
extern void qFloatToFloat16_fast(quint16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW;
|
||||||
|
extern void qFloatFromFloat16_fast(float *out, const quint16 *in, qssize_t len) Q_DECL_NOTHROW;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__)
|
||||||
|
static inline bool hasFastF16()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void qFloatToFloat16_fast(quint16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
__fp16 *out_f16 = reinterpret_cast<__fp16 *>(out);
|
||||||
|
qssize_t i = 0;
|
||||||
|
for (; i < len - 3; i += 4)
|
||||||
|
vst1_f16(out_f16 + i, vcvt_f16_f32(vld1q_f32(in + i)));
|
||||||
|
SIMD_EPILOGUE(i, len, 3)
|
||||||
|
out_f16[i] = __fp16(in[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void qFloatFromFloat16_fast(quint16 *out, const quint16 *in, qssize_t len) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
const __fp16 *in_f16 = reinterpret_cast<const __fp16 *>(in);
|
||||||
|
qssize_t i = 0;
|
||||||
|
for (; i < len - 3; i += 4)
|
||||||
|
vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
|
||||||
|
SIMD_EPILOGUE(i, len, 3)
|
||||||
|
out[i] = float(in_f16[i]);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static inline bool hasFastF16()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void qFloatToFloat16_fast(quint16 *, const float *, qssize_t) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
Q_UNREACHABLE();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void qFloatFromFloat16_fast(float *, const quint16 *, qssize_t) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
Q_UNREACHABLE();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/*!
|
||||||
|
\since 5.11
|
||||||
|
|
||||||
|
Converts \a len floats from \a in to qfloat16 and stores them in \a out.
|
||||||
|
Both \a in and \a out must have \a len allocated entries.
|
||||||
|
*/
|
||||||
|
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
if (hasFastF16())
|
||||||
|
return qFloatToFloat16_fast(reinterpret_cast<quint16 *>(out), in, len);
|
||||||
|
|
||||||
|
for (qssize_t i = 0; i < len; ++i)
|
||||||
|
out[i] = qfloat16(in[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
\since 5.11
|
||||||
|
|
||||||
|
Converts \a len qfloat16 from \a in to floats and stores them in \a out.
|
||||||
|
Both \a in and \a out must have \a len allocated entries.
|
||||||
|
*/
|
||||||
|
Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qssize_t len) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
if (hasFastF16())
|
||||||
|
return qFloatFromFloat16_fast(out, reinterpret_cast<const quint16 *>(in), len);
|
||||||
|
|
||||||
|
for (qssize_t i = 0; i < len; ++i)
|
||||||
|
out[i] = float(in[i]);
|
||||||
|
}
|
||||||
|
|
||||||
QT_END_NAMESPACE
|
QT_END_NAMESPACE
|
||||||
|
@ -79,6 +79,9 @@ private:
|
|||||||
|
|
||||||
Q_DECLARE_TYPEINFO(qfloat16, Q_PRIMITIVE_TYPE);
|
Q_DECLARE_TYPEINFO(qfloat16, Q_PRIMITIVE_TYPE);
|
||||||
|
|
||||||
|
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *, const float *, qssize_t length) Q_DECL_NOTHROW;
|
||||||
|
Q_CORE_EXPORT void qFloatFromFloat16(float *, const qfloat16 *, qssize_t length) Q_DECL_NOTHROW;
|
||||||
|
|
||||||
Q_REQUIRED_RESULT Q_CORE_EXPORT bool qIsInf(qfloat16 f) Q_DECL_NOTHROW; // complements qnumeric.h
|
Q_REQUIRED_RESULT Q_CORE_EXPORT bool qIsInf(qfloat16 f) Q_DECL_NOTHROW; // complements qnumeric.h
|
||||||
Q_REQUIRED_RESULT Q_CORE_EXPORT bool qIsNaN(qfloat16 f) Q_DECL_NOTHROW; // complements qnumeric.h
|
Q_REQUIRED_RESULT Q_CORE_EXPORT bool qIsNaN(qfloat16 f) Q_DECL_NOTHROW; // complements qnumeric.h
|
||||||
Q_REQUIRED_RESULT Q_CORE_EXPORT bool qIsFinite(qfloat16 f) Q_DECL_NOTHROW; // complements qnumeric.h
|
Q_REQUIRED_RESULT Q_CORE_EXPORT bool qIsFinite(qfloat16 f) Q_DECL_NOTHROW; // complements qnumeric.h
|
||||||
|
87
src/corelib/global/qfloat16_f16c.c
Normal file
87
src/corelib/global/qfloat16_f16c.c
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
**
|
||||||
|
** Copyright (C) 2017 The Qt Company Ltd.
|
||||||
|
** Contact: https://www.qt.io/licensing/
|
||||||
|
**
|
||||||
|
** This file is part of the QtCore module of the Qt Toolkit.
|
||||||
|
**
|
||||||
|
** $QT_BEGIN_LICENSE:LGPL$
|
||||||
|
** Commercial License Usage
|
||||||
|
** Licensees holding valid commercial Qt licenses may use this file in
|
||||||
|
** accordance with the commercial license agreement provided with the
|
||||||
|
** Software or, alternatively, in accordance with the terms contained in
|
||||||
|
** a written agreement between you and The Qt Company. For licensing terms
|
||||||
|
** and conditions see https://www.qt.io/terms-conditions. For further
|
||||||
|
** information use the contact form at https://www.qt.io/contact-us.
|
||||||
|
**
|
||||||
|
** GNU Lesser General Public License Usage
|
||||||
|
** Alternatively, this file may be used under the terms of the GNU Lesser
|
||||||
|
** General Public License version 3 as published by the Free Software
|
||||||
|
** Foundation and appearing in the file LICENSE.LGPL3 included in the
|
||||||
|
** packaging of this file. Please review the following information to
|
||||||
|
** ensure the GNU Lesser General Public License version 3 requirements
|
||||||
|
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
|
||||||
|
**
|
||||||
|
** GNU General Public License Usage
|
||||||
|
** Alternatively, this file may be used under the terms of the GNU
|
||||||
|
** General Public License version 2.0 or (at your option) the GNU General
|
||||||
|
** Public license version 3 or any later version approved by the KDE Free
|
||||||
|
** Qt Foundation. The licenses are as published by the Free Software
|
||||||
|
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
|
||||||
|
** included in the packaging of this file. Please review the following
|
||||||
|
** information to ensure the GNU General Public License requirements will
|
||||||
|
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
|
||||||
|
** https://www.gnu.org/licenses/gpl-3.0.html.
|
||||||
|
**
|
||||||
|
** $QT_END_LICENSE$
|
||||||
|
**
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
#include "private/qsimd_p.h"
|
||||||
|
|
||||||
|
// The x86 F16C instructions operate on AVX registers, so AVX support is
|
||||||
|
// required. We don't need to check for __F16C__ because we this file wouldn't
|
||||||
|
// have been compiled if the support was missing in the first place, and not
|
||||||
|
// all compilers define it. Technically, we didn't need to check for __AVX__
|
||||||
|
// either.
|
||||||
|
#if !defined(__AVX__)
|
||||||
|
# error "AVX support required"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
QT_BEGIN_NAMESPACE
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void qFloatToFloat16_fast(quint16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
qssize_t i = 0;
|
||||||
|
for (; i < len - 7; i += 8)
|
||||||
|
_mm_storeu_si128((__m128i *)(out + i), _mm256_cvtps_ph(_mm256_loadu_ps(in + i), 0));
|
||||||
|
if (i < len - 3) {
|
||||||
|
_mm_storel_epi64((__m128i *)(out + i), _mm_cvtps_ph(_mm_loadu_ps(in + i), 0));
|
||||||
|
i += 4;
|
||||||
|
}
|
||||||
|
// Inlining "quint16::quint16(float f)" to avoid getting the fallback version.
|
||||||
|
SIMD_EPILOGUE(i, len, 3)
|
||||||
|
out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void qFloatFromFloat16_fast(float *out, const quint16 *in, qssize_t len) Q_DECL_NOTHROW
|
||||||
|
{
|
||||||
|
qssize_t i = 0;
|
||||||
|
for (; i < len - 7; i += 8)
|
||||||
|
_mm256_storeu_ps(out + i, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(in + i))));
|
||||||
|
if (i < len - 3) {
|
||||||
|
_mm_storeu_ps(out + i, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(in + i))));
|
||||||
|
i += 4;
|
||||||
|
}
|
||||||
|
// Inlining "quint16::operator float()" to avoid getting the fallback version.
|
||||||
|
SIMD_EPILOGUE(i, len, 3)
|
||||||
|
out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
QT_END_NAMESPACE
|
||||||
|
#endif
|
@ -246,6 +246,15 @@ typedef unsigned long long quint64; /* 64 bit unsigned */
|
|||||||
typedef qint64 qlonglong;
|
typedef qint64 qlonglong;
|
||||||
typedef quint64 qulonglong;
|
typedef quint64 qulonglong;
|
||||||
|
|
||||||
|
#ifndef __cplusplus
|
||||||
|
// In C++ mode, we define below using QIntegerForSize template
|
||||||
|
Q_STATIC_ASSERT_X(sizeof(ptrdiff_t) == sizeof(size_t), "Weird ptrdiff_t and size_t definitions");
|
||||||
|
typedef ptrdiff_t qptrdiff;
|
||||||
|
typedef ptrdiff_t qssize_t;
|
||||||
|
typedef ptrdiff_t qintptr;
|
||||||
|
typedef size_t quintptr;
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Useful type definitions for Qt
|
Useful type definitions for Qt
|
||||||
*/
|
*/
|
||||||
|
@ -223,4 +223,3 @@ TR_EXCLUDE += ../3rdparty/*
|
|||||||
# MIPS DSP
|
# MIPS DSP
|
||||||
MIPS_DSP_ASM += tools/qstring_mips_dsp_asm.S
|
MIPS_DSP_ASM += tools/qstring_mips_dsp_asm.S
|
||||||
MIPS_DSP_HEADERS += ../gui/painting/qt_mips_asm_dsp_p.h
|
MIPS_DSP_HEADERS += ../gui/painting/qt_mips_asm_dsp_p.h
|
||||||
CONFIG += simd
|
|
||||||
|
@ -46,6 +46,8 @@ private slots:
|
|||||||
void promotionTests();
|
void promotionTests();
|
||||||
void arithOps_data();
|
void arithOps_data();
|
||||||
void arithOps();
|
void arithOps();
|
||||||
|
void floatToFloat16();
|
||||||
|
void floatFromFloat16();
|
||||||
};
|
};
|
||||||
|
|
||||||
void tst_qfloat16::fuzzyCompare_data()
|
void tst_qfloat16::fuzzyCompare_data()
|
||||||
@ -305,5 +307,41 @@ void tst_qfloat16::arithOps()
|
|||||||
QVERIFY(qFuzzyCompare(r4,1.f/val2));
|
QVERIFY(qFuzzyCompare(r4,1.f/val2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void tst_qfloat16::floatToFloat16()
|
||||||
|
{
|
||||||
|
float in[63];
|
||||||
|
qfloat16 out[63];
|
||||||
|
qfloat16 expected[63];
|
||||||
|
|
||||||
|
for (int i = 0; i < 63; ++i)
|
||||||
|
in[i] = i * (1/13.f);
|
||||||
|
|
||||||
|
for (int i = 0; i < 63; ++i)
|
||||||
|
expected[i] = qfloat16(in[i]);
|
||||||
|
|
||||||
|
qFloatToFloat16(out, in, 63);
|
||||||
|
|
||||||
|
for (int i = 0; i < 63; ++i)
|
||||||
|
QVERIFY(qFuzzyCompare(out[i], expected[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_qfloat16::floatFromFloat16()
|
||||||
|
{
|
||||||
|
qfloat16 in[35];
|
||||||
|
float out[35];
|
||||||
|
float expected[35];
|
||||||
|
|
||||||
|
for (int i = 0; i < 35; ++i)
|
||||||
|
in[i] = qfloat16(i * (17.f / 3));
|
||||||
|
|
||||||
|
for (int i = 0; i < 35; ++i)
|
||||||
|
expected[i] = float(in[i]);
|
||||||
|
|
||||||
|
qFloatFromFloat16(out, in, 35);
|
||||||
|
|
||||||
|
for (int i = 0; i < 35; ++i)
|
||||||
|
QCOMPARE(out[i], expected[i]);
|
||||||
|
}
|
||||||
|
|
||||||
QTEST_APPLESS_MAIN(tst_qfloat16)
|
QTEST_APPLESS_MAIN(tst_qfloat16)
|
||||||
#include "tst_qfloat16.moc"
|
#include "tst_qfloat16.moc"
|
||||||
|
@ -71,6 +71,12 @@ void tst_GlobalTypes()
|
|||||||
|
|
||||||
qreal qr;
|
qreal qr;
|
||||||
Q_UNUSED(qr);
|
Q_UNUSED(qr);
|
||||||
|
|
||||||
|
qssize_t qs;
|
||||||
|
qptrdiff qp;
|
||||||
|
qintptr qip;
|
||||||
|
quintptr qup;
|
||||||
|
Q_UNUSED(qs); Q_UNUSED(qp); Q_UNUSED(qip); Q_UNUSED(qup);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Qt version */
|
/* Qt version */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user