rcc: de-duplicate data in resources

Content-based de-duplication: resources are hashed with SHA-256, and the
full data is compared when candidates with the same hash value are found.

Task-number: QTBUG-126168
Change-Id: Ifebc8ca322e354d8ea1f701f27f3f65916f7555c
Reviewed-by: hjk <hjk@qt.io>
This commit is contained in:
Christoph Cullmann 2024-06-11 18:11:36 +02:00
parent a8b7da59cb
commit 607b3b2feb
7 changed files with 238 additions and 8 deletions

View File

@ -1,10 +1,12 @@
// Copyright (C) 2018 The Qt Company Ltd.
// Copyright (C) 2018 Intel Corporation.
// Copyright (C) 2024 Christoph Cullmann <christoph@cullmann.io>
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "rcc.h"
#include <qbytearray.h>
#include <qcryptographichash.h>
#include <qdatetime.h>
#include <qdebug.h>
#include <qdir.h>
@ -90,8 +92,28 @@ public:
QString resourceName() const;
// Lookup key used to find already written resource data that may be shared.
// Two keys match only when the compression settings and the content hash
// are all identical; writeDataBlob() stores the SHA-256 of the file data
// in `hash`.
struct DeduplicationKey {
    RCCResourceLibrary::CompressionAlgorithm compressAlgo;
    int compressLevel;
    int compressThreshold;
    QByteArray hash;

    // Member-wise equality, checked in declaration order; the byte array
    // comparison is only reached once the three settings agree.
    bool operator==(const DeduplicationKey &other) const
    {
        if (compressAlgo != other.compressAlgo)
            return false;
        if (compressLevel != other.compressLevel)
            return false;
        if (compressThreshold != other.compressThreshold)
            return false;
        return hash == other.hash;
    }
};
// Multi-hash from key to file entries: several entries may share one key,
// so users of the table can iterate over all candidates for a key.
using DeduplicationMultiHash = QMultiHash<DeduplicationKey, RCCFileInfo *>;
public:
qint64 writeDataBlob(RCCResourceLibrary &lib, qint64 offset, QString *errorMessage);
qint64 writeDataBlob(RCCResourceLibrary &lib,
qint64 offset,
DeduplicationMultiHash &dedupByContent,
QString *errorMessage);
qint64 writeDataName(RCCResourceLibrary &, qint64 offset);
void writeDataInfo(RCCResourceLibrary &lib);
@ -114,6 +136,11 @@ public:
qint64 m_childOffset = 0;
};
// Hash function that lets RCCFileInfo::DeduplicationKey be used as a key in
// Qt's hashed containers. All four fields that take part in operator== are
// combined with the seed, in declaration order.
static size_t qHash(const RCCFileInfo::DeduplicationKey &key, size_t seed) noexcept
{
    const auto &[algorithm, level, threshold, contentHash] = key;
    return qHashMulti(seed, algorithm, level, threshold, contentHash);
}
RCCFileInfo::RCCFileInfo(const QString &name, const QFileInfo &fileInfo, QLocale::Language language,
QLocale::Territory territory, uint flags,
RCCResourceLibrary::CompressionAlgorithm compressAlgo, int compressLevel,
@ -217,8 +244,10 @@ void RCCFileInfo::writeDataInfo(RCCResourceLibrary &lib)
}
}
qint64 RCCFileInfo::writeDataBlob(RCCResourceLibrary &lib, qint64 offset,
QString *errorMessage)
qint64 RCCFileInfo::writeDataBlob(RCCResourceLibrary &lib,
qint64 offset,
DeduplicationMultiHash &dedupByContent,
QString *errorMessage)
{
const bool text = lib.m_format == RCCResourceLibrary::C_Code;
const bool pass1 = lib.m_format == RCCResourceLibrary::Pass1;
@ -231,14 +260,38 @@ qint64 RCCFileInfo::writeDataBlob(RCCResourceLibrary &lib, qint64 offset,
QByteArray data;
if (!m_isEmpty) {
//find the data to be written
QFile file(m_fileInfo.absoluteFilePath());
// find the data to be written
const QString absoluteFilePath = m_fileInfo.absoluteFilePath();
QFile file(absoluteFilePath);
if (!file.open(QFile::ReadOnly)) {
*errorMessage = msgOpenReadFailed(m_fileInfo.absoluteFilePath(), file.errorString());
*errorMessage = msgOpenReadFailed(absoluteFilePath, file.errorString());
return 0;
}
data = file.readAll();
// de-duplicate the same file content, we can re-use already written data
// we only do that if we have the same compression settings
const QByteArray hash = QCryptographicHash::hash(data, QCryptographicHash::Sha256);
const DeduplicationKey key{m_compressAlgo, m_compressLevel, m_compressThreshold, hash};
const QList<RCCFileInfo *> potentialCandidates = dedupByContent.values(key);
for (const RCCFileInfo *candidate : potentialCandidates) {
// check real content, we can have collisions
QFile candidateFile(candidate->m_fileInfo.absoluteFilePath());
if (!candidateFile.open(QFile::ReadOnly)) {
*errorMessage = msgOpenReadFailed(candidate->m_fileInfo.absoluteFilePath(),
candidateFile.errorString());
return 0;
}
if (data != candidateFile.readAll()) {
continue;
}
// just remember the offset & flags with final compression state
// of the already written data and be done
m_dataOffset = candidate->m_dataOffset;
m_flags = candidate->m_flags;
return offset;
}
dedupByContent.insert(key, this);
}
// Check if compression is useful for this file
@ -1168,6 +1221,7 @@ bool RCCResourceLibrary::writeDataBlobs()
QStack<RCCFileInfo*> pending;
pending.push(m_root);
qint64 offset = 0;
RCCFileInfo::DeduplicationMultiHash dedupByContent;
QString errorMessage;
while (!pending.isEmpty()) {
RCCFileInfo *file = pending.pop();
@ -1176,7 +1230,8 @@ bool RCCResourceLibrary::writeDataBlobs()
if (child->m_flags & RCCFileInfo::Directory)
pending.push(child);
else {
offset = child->writeDataBlob(*this, offset, &errorMessage);
offset = child->writeDataBlob(*this, offset,
dedupByContent, &errorMessage);
if (offset == 0) {
m_errorDevice->write(errorMessage.toUtf8());
return false;

View File

@ -0,0 +1,157 @@
/****************************************************************************
** Resource object code
**
IGNORE:** Created by: The Resource Compiler for Qt version 6.9.0
**
** WARNING! All changes made in this file will be lost!
*****************************************************************************/
#ifdef _MSC_VER
// disable informational message "function ... selected for automatic inline expansion"
#pragma warning (disable: 4711)
#endif
static const unsigned char qt_resource_data[] = {
// b.txt
0x0,0x0,0x0,0xb,
0x62,
0x20,0x74,0x65,0x73,0x74,0x20,0x66,0x69,0x6c,0x65,
// c_with_a_content.txt
0x0,0x0,0x0,0xb,
0x61,
0x20,0x74,0x65,0x73,0x74,0x20,0x66,0x69,0x6c,0x65,
// b.txt
0x0,0x0,0x0,0xb,
0x62,
0x20,0x74,0x65,0x73,0x74,0x20,0x66,0x69,0x6c,0x65,
};
static const unsigned char qt_resource_name[] = {
// files
0x0,0x5,
0x0,0x6d,0x2,0xc3,
0x0,0x66,
0x0,0x69,0x0,0x6c,0x0,0x65,0x0,0x73,
// b.txt
0x0,0x5,
0x0,0x65,0x5b,0xf4,
0x0,0x62,
0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74,
// c_with_a_content.txt
0x0,0x14,
0x1,0x61,0x1d,0x34,
0x0,0x63,
0x0,0x5f,0x0,0x77,0x0,0x69,0x0,0x74,0x0,0x68,0x0,0x5f,0x0,0x61,0x0,0x5f,0x0,0x63,0x0,0x6f,0x0,0x6e,0x0,0x74,0x0,0x65,0x0,0x6e,0x0,0x74,0x0,0x2e,
0x0,0x74,0x0,0x78,0x0,0x74,
// a.txt
0x0,0x5,
0x0,0x64,0x5b,0xf4,
0x0,0x61,
0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74,
// alias_of_b_compress9.txt
0x0,0x18,
0xb,0x26,0xf,0xb4,
0x0,0x61,
0x0,0x6c,0x0,0x69,0x0,0x61,0x0,0x73,0x0,0x5f,0x0,0x6f,0x0,0x66,0x0,0x5f,0x0,0x62,0x0,0x5f,0x0,0x63,0x0,0x6f,0x0,0x6d,0x0,0x70,0x0,0x72,0x0,0x65,
0x0,0x73,0x0,0x73,0x0,0x39,0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74,
// alias_of_b.txt
0x0,0xe,
0x1,0xa4,0x6d,0x34,
0x0,0x61,
0x0,0x6c,0x0,0x69,0x0,0x61,0x0,0x73,0x0,0x5f,0x0,0x6f,0x0,0x66,0x0,0x5f,0x0,0x62,0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74,
// alias_of_b_compress9_dupe.txt
0x0,0x1d,
0x9,0x4,0x7a,0x14,
0x0,0x61,
0x0,0x6c,0x0,0x69,0x0,0x61,0x0,0x73,0x0,0x5f,0x0,0x6f,0x0,0x66,0x0,0x5f,0x0,0x62,0x0,0x5f,0x0,0x63,0x0,0x6f,0x0,0x6d,0x0,0x70,0x0,0x72,0x0,0x65,
0x0,0x73,0x0,0x73,0x0,0x39,0x0,0x5f,0x0,0x64,0x0,0x75,0x0,0x70,0x0,0x65,0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74,
};
static const unsigned char qt_resource_struct[] = {
// :
0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x1,
0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,
// :/files
0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x0,0x0,0x6,0x0,0x0,0x0,0x2,
0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,
// :/files/a.txt
0x0,0x0,0x0,0x4e,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0xf,
TIMESTAMP:files/a.txt
// :/files/b.txt
0x0,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,
TIMESTAMP:files/b.txt
// :/files/c_with_a_content.txt
0x0,0x0,0x0,0x20,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0xf,
TIMESTAMP:files/c_with_a_content.txt
// :/files/alias_of_b.txt
0x0,0x0,0x0,0x94,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0,
TIMESTAMP:files/b.txt
// :/files/alias_of_b_compress9_dupe.txt
0x0,0x0,0x0,0xb6,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x1e,
TIMESTAMP:files/b.txt
// :/files/alias_of_b_compress9.txt
0x0,0x0,0x0,0x5e,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x1e,
TIMESTAMP:files/b.txt
};
#ifdef QT_NAMESPACE
# define QT_RCC_PREPEND_NAMESPACE(name) ::QT_NAMESPACE::name
# define QT_RCC_MANGLE_NAMESPACE0(x) x
# define QT_RCC_MANGLE_NAMESPACE1(a, b) a##_##b
# define QT_RCC_MANGLE_NAMESPACE2(a, b) QT_RCC_MANGLE_NAMESPACE1(a,b)
# define QT_RCC_MANGLE_NAMESPACE(name) QT_RCC_MANGLE_NAMESPACE2( \
QT_RCC_MANGLE_NAMESPACE0(name), QT_RCC_MANGLE_NAMESPACE0(QT_NAMESPACE))
#else
# define QT_RCC_PREPEND_NAMESPACE(name) name
# define QT_RCC_MANGLE_NAMESPACE(name) name
#endif
#if defined(QT_INLINE_NAMESPACE)
inline namespace QT_NAMESPACE {
#elif defined(QT_NAMESPACE)
namespace QT_NAMESPACE {
#endif
bool qRegisterResourceData(int, const unsigned char *, const unsigned char *, const unsigned char *);
bool qUnregisterResourceData(int, const unsigned char *, const unsigned char *, const unsigned char *);
#ifdef QT_NAMESPACE
}
#endif
int QT_RCC_MANGLE_NAMESPACE(qInitResources)();
int QT_RCC_MANGLE_NAMESPACE(qInitResources)()
{
int version = 3;
QT_RCC_PREPEND_NAMESPACE(qRegisterResourceData)
(version, qt_resource_struct, qt_resource_name, qt_resource_data);
return 1;
}
int QT_RCC_MANGLE_NAMESPACE(qCleanupResources)();
int QT_RCC_MANGLE_NAMESPACE(qCleanupResources)()
{
int version = 3;
QT_RCC_PREPEND_NAMESPACE(qUnregisterResourceData)
(version, qt_resource_struct, qt_resource_name, qt_resource_data);
return 1;
}
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wexit-time-destructors"
#endif
namespace {
struct initializer {
initializer() { QT_RCC_MANGLE_NAMESPACE(qInitResources)(); }
~initializer() { QT_RCC_MANGLE_NAMESPACE(qCleanupResources)(); }
} dummy;
}
#ifdef __clang__
# pragma clang diagnostic pop
#endif

View File

@ -0,0 +1,10 @@
<!DOCTYPE RCC><RCC version="1.0">
<qresource>
<file>files/a.txt</file>
<file>files/b.txt</file>
<file alias="files/alias_of_b.txt">files/b.txt</file>
<file>files/c_with_a_content.txt</file>
<file alias="files/alias_of_b_compress9.txt" compress="9">files/b.txt</file>
<file alias="files/alias_of_b_compress9_dupe.txt" compress="9">files/b.txt</file>
</qresource>
</RCC>

View File

@ -0,0 +1 @@
a test file

View File

@ -0,0 +1 @@
b test file

View File

@ -0,0 +1 @@
a test file

View File

@ -152,6 +152,11 @@ void tst_rcc::rcc_data()
QTest::newRow("legal") << m_dataPath + QLatin1StringView("/legal")
<< "legal.qrc" << "rcc_legal.cpp";
if (sizeof(size_t) == 8) {
const QString deduplicationPath = m_dataPath + QLatin1String("/deduplication");
QTest::newRow("deduplication") << deduplicationPath << "deduplication.qrc" << "deduplication.expected";
}
}
static QStringList readLinesFromFile(const QString &fileName,