diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp index 591a1ca1c6c..80771417955 100644 --- a/src/corelib/tools/qhash.cpp +++ b/src/corelib/tools/qhash.cpp @@ -616,8 +616,39 @@ namespace { // the scrambling round (step 3 in [1]) because it's just very good at // spreading the bits around. // + // Note on Latin-1 hashing (ZX == ByteToWord): for simplicity of the + // algorithm, we pass sizes equivalent to the UTF-16 content (ZX == None). + // That means we must multiply by 2 on entry, divide by 2 on pointer + // advancing, and load half as much data from memory (though we produce + // exactly as much data in registers). The compilers appear to optimize + // this out. + // // [1] https://en.wikipedia.org/wiki/Advanced_Encryption_Standard#High-level_description_of_the_algorithm + template static const T *advance(const T *ptr, ptrdiff_t n) + { + if constexpr (ZX == None) + return ptr + n; + + // see note above on ZX == ByteToWord hashing + auto p = reinterpret_cast(ptr); + n *= sizeof(T); + return reinterpret_cast(p + n/2); + } + + template static __m128i loadu128(const void *ptr); + template <> Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) __m128i loadu128(const void *ptr) + { + return _mm_loadu_si128(reinterpret_cast(ptr)); + } + template <> Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) __m128i loadu128(const void *ptr) + { + // use a MOVQ followed by PMOVZXBW + // the compiler usually combines them as a single, loading PMOVZXBW + __m128i data = _mm_loadl_epi64(static_cast(ptr)); + return _mm_cvtepu8_epi16(data); + } + // hash 16 bytes, running 3 scramble rounds of AES on itself (like label "final1") static void Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) QT_VECTORCALL hash16bytes(__m128i &state0, __m128i data) @@ -629,11 +660,12 @@ namespace { } // hash twice 16 bytes, running 2 scramble rounds of AES on itself + template static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const __m128i *src1) { - __m128i data0 = _mm_loadu_si128(src0); - __m128i data1 = _mm_loadu_si128(src1); + __m128i data0 = loadu128(src0); + __m128i data1 = loadu128(src1); state0 = _mm_xor_si128(data0, state0); state1 = _mm_xor_si128(data1, state1); state0 = _mm_aesenc_si128(state0, state0); @@ -680,16 +712,18 @@ Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const } } +template static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend) { { - if (src + 1 < srcend) { + const __m128i *src2 = advance(srcend, -1); + if (advance(src, 1) < srcend) { // epilogue: between 16 and 31 bytes - hash2x16bytes(state0, state1, src, srcend - 1); + hash2x16bytes(state0, state1, src, src2); } else if (src != srcend) { // epilogue: between 1 and 16 bytes, overlap with the end - __m128i data = _mm_loadu_si128(srcend - 1); + __m128i data = loadu128(src2); hash16bytes(state0, data); } @@ -700,8 +734,21 @@ aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const __m1 return mm_cvtsi128_sz(state0); } +// load all 16 bytes and mask off the bytes past the end of the source +static const qint8 maskarray[] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// load 16 bytes ending at the data end, then shuffle them to the beginning +static const qint8 shufflecontrol[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +template static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL -aeshash128_lt16(__m128i state0, const uchar *p, size_t len) +aeshash128_lt16(__m128i state0, const __m128i *src, const __m128i *srcend, size_t len) { if (len) { // We're going to load 16 bytes and mask zero the part we don't care @@ -712,25 +759,15 @@ aeshash128_lt16(__m128i state0, const uchar *p, size_t len) constexpr quintptr PageSize = 4096; __m128i data; - if ((quintptr(p) & (PageSize / 2)) == 0) { + if ((quintptr(src) & (PageSize / 2)) == 0) { // lower half of the page: - // load all 16 bytes and mask off the bytes past the end of the source - static const qint8 maskarray[] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; __m128i mask = _mm_loadu_si128(reinterpret_cast(maskarray + 15 - len)); - data = _mm_loadu_si128(reinterpret_cast(p)); + data = loadu128(src); data = _mm_and_si128(data, mask); } else { // upper half of the page: - // load 16 bytes ending at the data end, then shuffle them to the beginning - static const qint8 shufflecontrol[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; __m128i control = _mm_loadu_si128(reinterpret_cast(shufflecontrol + 15 - len)); - data = _mm_loadu_si128(reinterpret_cast(p + len) - 1); + data = loadu128(advance(srcend, -1)); data = _mm_shuffle_epi8(data, control); } @@ -739,24 +776,45 @@ aeshash128_lt16(__m128i state0, const uchar *p, size_t len) return mm_cvtsi128_sz(state0); } +template static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend) { // main loop: scramble two 16-byte blocks - for ( ; src + 2 < srcend; src += 2) - hash2x16bytes(state0, state1, src, src + 1); + for ( ; advance(src, 2) < srcend; src = advance(src, 2)) + hash2x16bytes(state0, state1, src, advance(src, 1)); - return aeshash128_16to32(state0, state1, src, srcend); + return aeshash128_16to32(state0, state1, src, srcend); } # if QT_COMPILER_SUPPORTS_HERE(VAES) +template static __m256i loadu256(const void *ptr); +template <> Q_ALWAYS_INLINE QT_FUNCTION_TARGET(VAES) __m256i loadu256(const void *ptr) +{ + return _mm256_loadu_si256(reinterpret_cast(ptr)); +} +template <> Q_ALWAYS_INLINE QT_FUNCTION_TARGET(VAES) __m256i loadu256(const void *ptr) +{ + // VPMOVZXBW xmm, ymm + __m128i data = _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm256_cvtepu8_epi16(data); +} + +template static size_t QT_FUNCTION_TARGET(VAES_AVX512) QT_VECTORCALL aeshash256_lt32_avx256(__m256i state0, const uchar *p, size_t len) { __m128i state0_128 = _mm256_castsi256_si128(state0); if (len) { - __mmask32 mask = _bzhi_u32(-1, unsigned(len)); - __m256i data = _mm256_maskz_loadu_epi8(mask, p); + __m256i data; + if constexpr (ZX == None) { + __mmask32 mask = _bzhi_u32(-1, unsigned(len)); + data = _mm256_maskz_loadu_epi8(mask, p); + } else { + __mmask16 mask = _bzhi_u32(-1, unsigned(len) / 2); + __m128i data0 = _mm_maskz_loadu_epi8(mask, p); + data = _mm256_cvtepu8_epi16(data0); + } __m128i data0 = _mm256_castsi256_si128(data); if (len >= sizeof(__m128i)) { state0 = _mm256_xor_si256(state0, data); @@ -776,8 +834,9 @@ aeshash256_lt32_avx256(__m256i state0, const uchar *p, size_t len) return mm_cvtsi128_sz(state0_128); } +template static size_t QT_FUNCTION_TARGET(VAES) QT_VECTORCALL -aeshash256_ge32(__m256i state0, const uchar *p, size_t len) +aeshash256_ge32(__m256i state0, const __m128i *s, const __m128i *end, size_t len) { static const auto hash32bytes = [](__m256i &state0, __m256i data) QT_FUNCTION_TARGET(VAES) { state0 = _mm256_xor_si256(state0, data); @@ -787,10 +846,10 @@ aeshash256_ge32(__m256i state0, const uchar *p, size_t len) }; // hash twice 32 bytes, running 2 scramble rounds of AES on itself - const auto hash2x32bytes = [](__m256i &state0, __m256i &state1, const __m256i *src0, - const __m256i *src1) QT_FUNCTION_TARGET(VAES) { - __m256i data0 = _mm256_loadu_si256(src0); - __m256i data1 = _mm256_loadu_si256(src1); + const auto hash2x32bytes = [](__m256i &state0, __m256i &state1, const void *src0, + const void *src1) QT_FUNCTION_TARGET(VAES) { + __m256i data0 = loadu256(src0); + __m256i data1 = loadu256(src1); state0 = _mm256_xor_si256(data0, state0); state1 = _mm256_xor_si256(data1, state1); state0 = _mm256_aesenc_epi128(state0, state0); @@ -799,21 +858,22 @@ aeshash256_ge32(__m256i state0, const uchar *p, size_t len) state1 = _mm256_aesenc_epi128(state1, state1); }; - const __m256i *src = reinterpret_cast(p); - const __m256i *srcend = reinterpret_cast(p + len); + const __m256i *src = reinterpret_cast(s); + const __m256i *srcend = reinterpret_cast(end); __m256i state1 = _mm256_aesenc_epi128(state0, mm256_set1_epz(len)); // main loop: scramble two 32-byte blocks - for ( ; src + 2 < srcend; src += 2) - hash2x32bytes(state0, state1, src, src + 1); + for ( ; advance(src, 2) < srcend; src = advance(src, 2)) + hash2x32bytes(state0, state1, src, advance(src, 1)); - if (src + 1 < srcend) { + const __m256i *src2 = advance(srcend, -1); + if (advance(src, 1) < srcend) { // epilogue: between 32 and 31 bytes - hash2x32bytes(state0, state1, src, srcend - 1); + hash2x32bytes(state0, state1, src, src2); } else if (src != srcend) { // epilogue: between 1 and 32 bytes, overlap with the end - __m256i data = _mm256_loadu_si256(srcend - 1); + __m256i data = loadu256(src2); hash32bytes(state0, data); } @@ -826,59 +886,69 @@ aeshash256_ge32(__m256i state0, const uchar *p, size_t len) return mm_cvtsi128_sz(_mm_xor_si128(low, high)); } +template static size_t QT_FUNCTION_TARGET(VAES) aeshash256(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept { AESHashSeed state(seed, seed2); auto src = reinterpret_cast(p); - const auto srcend = reinterpret_cast(p + len); + const auto srcend = reinterpret_cast(advance(p, len)); if (len < sizeof(__m128i)) - return aeshash128_lt16(state.state0, p, len); + return aeshash128_lt16(state.state0, src, srcend, len); if (len <= sizeof(__m256i)) - return aeshash128_16to32(state.state0, state.state1(), src, srcend); + return aeshash128_16to32(state.state0, state.state1(), src, srcend); - return aeshash256_ge32(state.state0_256(), p, len); + return aeshash256_ge32(state.state0_256(), src, srcend, len); } +template static size_t QT_FUNCTION_TARGET(VAES_AVX512) aeshash256_avx256(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept { AESHashSeed state(seed, seed2); - if (len <= sizeof(__m256i)) - return aeshash256_lt32_avx256(state.state0_256(), p, len); + auto src = reinterpret_cast(p); + const auto srcend = reinterpret_cast(advance(p, len)); - return aeshash256_ge32(state.state0_256(), p, len); + if (len <= sizeof(__m256i)) + return aeshash256_lt32_avx256(state.state0_256(), p, len); + + return aeshash256_ge32(state.state0_256(), src, srcend, len); } # endif // VAES +template static size_t QT_FUNCTION_TARGET(AES) aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept { AESHashSeed state(seed, seed2); auto src = reinterpret_cast(p); - const auto srcend = reinterpret_cast(p + len); + const auto srcend = reinterpret_cast(advance(p, len)); if (len < sizeof(__m128i)) - return aeshash128_lt16(state.state0, p, len); + return aeshash128_lt16(state.state0, src, srcend, len); if (len <= sizeof(__m256i)) - return aeshash128_16to32(state.state0, state.state1(), src, srcend); + return aeshash128_16to32(state.state0, state.state1(), src, srcend); - return aeshash128_ge32(state.state0, state.state1(), src, srcend); + return aeshash128_ge32(state.state0, state.state1(), src, srcend); } +template static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept { + if constexpr (ZX == ByteToWord) + len *= 2; // see note above on ZX == ByteToWord hashing + # if QT_COMPILER_SUPPORTS_HERE(VAES) if (qCpuHasFeature(VAES)) { if (qCpuHasFeature(AVX512VL)) - return aeshash256_avx256(p, len, seed, seed2); - return aeshash256(p, len, seed, seed2); + return aeshash256_avx256(p, len, seed, seed2); + return aeshash256(p, len, seed, seed2); } # endif - return aeshash128(p, len, seed, seed2); + return aeshash128(p, len, seed, seed2); } #endif // x86 AESNI @@ -1090,6 +1160,10 @@ size_t qHash(QLatin1StringView key, size_t seed) noexcept if (seed) seed2 = qt_qhash_seed.currentSeed(1); +#if defined(AESHASH) + if (seed && qCpuHasFeature(AES) && qCpuHasFeature(SSE4_2)) + return aeshash(data, size, seed, seed2); +#endif return qHashBits_fallback(data, size, seed, seed2); } diff --git a/tests/auto/corelib/tools/qhashfunctions/tst_qhashfunctions.cpp b/tests/auto/corelib/tools/qhashfunctions/tst_qhashfunctions.cpp index fdb2b373469..06f18dfe9c2 100644 --- a/tests/auto/corelib/tools/qhashfunctions/tst_qhashfunctions.cpp +++ b/tests/auto/corelib/tools/qhashfunctions/tst_qhashfunctions.cpp @@ -289,10 +289,12 @@ void tst_QHashFunctions::stringConsistency_data() QTest::newRow("null") << QString(); QTest::newRow("empty") << ""; QTest::newRow("withnull") << QStringLiteral("A\0z"); - QTest::newRow("short-ascii") << "Hello"; + QTest::newRow("short-ascii") << "Hello"; // 10 bytes + QTest::newRow("medium-ascii") << "Hello, World"; // 24 bytes QTest::newRow("long-ascii") << QStringLiteral("abcdefghijklmnopqrstuvxyz").repeated(16); QTest::newRow("short-latin1") << "Bokmål"; + QTest::newRow("medium-latin1") << "Det går bra!"; // 24 bytes QTest::newRow("long-latin1") << R"(Alle mennesker er født frie og med samme menneskeverd og menneskerettigheter. De er utstyrt med fornuft og samvittighet og bør handle mot hverandre i brorskapets ånd.)"; @@ -327,8 +329,6 @@ void tst_QHashFunctions::stringConsistency() QLatin1StringView l1sv(l1ba.data(), l1ba.size()); #ifdef Q_PROCESSOR_ARM // zero-extending aeshash not implemented on ARM -#elif defined(Q_PROCESSOR_X86) - // zero-extending aeshash not implemented on x86 #else if (value == l1sv) QCOMPARE(qHash(l1sv, seed), qHash(value, seed)); diff --git a/tests/benchmarks/corelib/tools/qhash/outofline.cpp b/tests/benchmarks/corelib/tools/qhash/outofline.cpp index f44746c5b1b..5b16c36ffbb 100644 --- a/tests/benchmarks/corelib/tools/qhash/outofline.cpp +++ b/tests/benchmarks/corelib/tools/qhash/outofline.cpp @@ -5,7 +5,7 @@ QT_BEGIN_NAMESPACE -size_t qHash(const Qt4String &str) +size_t qHash(const Qt4String &str, size_t /* never used */) { qsizetype n = str.size(); const QChar *p = str.unicode(); @@ -40,7 +40,7 @@ size_t qHash(const Qt50String &key, size_t seed) // Still, we can avoid writing the multiplication as "(h << 5) - h" // -- the compiler will turn it into a shift and an addition anyway // (for instance, gcc 4.4 does that even at -O0). -size_t qHash(const JavaString &str) +size_t qHash(const JavaString &str, size_t /* never used */) { const auto *p = reinterpret_cast(str.constData()); const qsizetype len = str.size(); diff --git a/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.cpp b/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.cpp index f6214dfa360..1a62a484372 100644 --- a/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.cpp +++ b/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.cpp @@ -13,6 +13,8 @@ #include #include +static constexpr quint64 RandomSeed32 = 1045982819; +static constexpr quint64 RandomSeed64 = QtPrivate::QHashCombine{}(RandomSeed32, RandomSeed32); class tst_QHash : public QObject { @@ -31,6 +33,8 @@ private slots: void hashing_current_data() { data(); } void hashing_current() { hashing_template(); } + void hashing_qbytearray_data() { data(); } + void hashing_qbytearray() { hashing_template(); } void hashing_qt50_data() { data(); } void hashing_qt50() { hashing_template(); } void hashing_qt4_data() { data(); } @@ -38,15 +42,25 @@ private slots: void hashing_javaString_data() { data(); } void hashing_javaString() { hashing_template(); } + void hashing_nonzero_current_data() { data(); } + void hashing_nonzero_current() { hashing_nonzero_template(); } + void hashing_nonzero_qbytearray_data() { data(); } + void hashing_nonzero_qbytearray() { hashing_nonzero_template(); } + void hashing_nonzero_qlatin1string_data() { data(); } + void hashing_nonzero_qlatin1string() { hashing_nonzero_template(); } + private: void data(); template void qhash_template(); - template void hashing_template(); + template void hashing_template(); + template void hashing_nonzero_template() + { hashing_template(); } QStringList smallFilePaths; QStringList uuids; QStringList dict; QStringList numbers; + QStringList longstrings; }; ///////////////////// QHash ///////////////////// @@ -68,10 +82,12 @@ void tst_QHash::initTestCase() // guaranteed to be completely random, generated by http://xkcd.com/221/ QUuid ns = QUuid("{f43d2ef3-2fe9-4563-a6f5-5a0100c2d699}"); uuids.reserve(smallFilePaths.size()); + longstrings.reserve(smallFilePaths.size()); foreach (const QString &path, smallFilePaths) uuids.append(QUuid::createUuidV5(ns, path).toString()); - + for (qsizetype i = 0; i < uuids.size(); ++i) + longstrings.append(uuids.at(i).repeated(8)); // lots of strings with alphabetical characters, vaguely reminiscent of // a dictionary. @@ -112,6 +128,7 @@ void tst_QHash::data() QTest::addColumn("items"); QTest::newRow("paths-small") << smallFilePaths; QTest::newRow("uuids-list") << uuids; + QTest::newRow("longstrings-list") << longstrings; QTest::newRow("dictionary") << dict; QTest::newRow("numbers") << numbers; } @@ -132,19 +149,30 @@ template void tst_QHash::qhash_template() } } -template void tst_QHash::hashing_template() +template void tst_QHash::hashing_template() { // just the hashing function QFETCH(QStringList, items); QList realitems; realitems.reserve(items.size()); - foreach (const QString &s, items) - realitems.append(s); + foreach (const QString &s, items) { + if constexpr (std::is_same_v) { + realitems.append(s); + } else if constexpr (sizeof(typename String::value_type) == 1) { + realitems.append(String(s.toLatin1())); + } + } QBENCHMARK { - for (int i = 0, n = realitems.size(); i != n; ++i) - (void)qHash(realitems.at(i)); + for (int i = 0, n = realitems.size(); i != n; ++i) { + volatile size_t h = qHash(realitems.at(i), Seed); + (void)h; +#ifdef Q_CC_GNU + // "use" h + asm ("" : "+r" (h)); +#endif + } } } diff --git a/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.h b/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.h index 7d89f044c4e..501b4a8b7f1 100644 --- a/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.h +++ b/tests/benchmarks/corelib/tools/qhash/tst_bench_qhash.h @@ -1,8 +1,20 @@ // Copyright (C) 2016 The Qt Company Ltd. // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only +#include #include +struct OwningLatin1String : QByteArray +{ + OwningLatin1String() = default; + OwningLatin1String(const QByteArray &a) : QByteArray(a) {} + OwningLatin1String(QByteArray &&a) : QByteArray(std::move(a)) {} +}; +QT_BEGIN_NAMESPACE +inline size_t qHash(const OwningLatin1String &s, size_t seed = 0) +{ return qHash(QLatin1StringView(s), seed); } +QT_END_NAMESPACE + struct Qt4String : QString { Qt4String() {} @@ -10,7 +22,7 @@ struct Qt4String : QString }; QT_BEGIN_NAMESPACE -size_t qHash(const Qt4String &); +size_t qHash(const Qt4String &, size_t = 0); QT_END_NAMESPACE struct Qt50String : QString @@ -31,6 +43,6 @@ struct JavaString : QString }; QT_BEGIN_NAMESPACE -size_t qHash(const JavaString &); +size_t qHash(const JavaString &, size_t = 0); QT_END_NAMESPACE