Refactor the URL recoder a little

Change it to operate on QChar pointers, which gains a little
performance. This also avoids unnecessarily detaching the source
QString.

In addition, append the output to an existing QString instead of
returning a new one. This will be useful later when we're
reconstructing a URL from its components.

Change-Id: I7e2f64028277637bd329af5f98001ace253a50c7
Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
This commit is contained in:
Thiago Macieira 2011-09-08 17:40:36 +02:00 committed by Qt by Nokia
parent 73e16b15a6
commit b75aa795fe
2 changed files with 104 additions and 89 deletions

View File

@ -138,8 +138,8 @@ static inline ushort decodeNibble(ushort c)
// assumes that the range has been checked already // assumes that the range has been checked already
static inline ushort decodePercentEncoding(const ushort *input) static inline ushort decodePercentEncoding(const ushort *input)
{ {
ushort c1 = input[0]; ushort c1 = input[1];
ushort c2 = input[1]; ushort c2 = input[2];
if (!isHex(c1) || !isHex(c2)) if (!isHex(c1) || !isHex(c2))
return ushort(-1); return ushort(-1);
return decodeNibble(c1) << 4 | decodeNibble(c2); return decodeNibble(c1) << 4 | decodeNibble(c2);
@ -151,18 +151,27 @@ static inline ushort encodeNibble(ushort c)
return hexnumbers[c & 0xf]; return hexnumbers[c & 0xf];
} }
static void ensureDetached(QString &result, ushort *&output, const ushort *input, const ushort *end) static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end,
int add = 0)
{ {
if (!output) { if (!output) {
// now detach // now detach
// create enough space if the rest of the string needed to be percent-encoded // create enough space if the rest of the string needed to be percent-encoded
int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1; int charsProcessed = input - begin;
int charsRemaining = end - input + 1; int charsRemaining = end - input;
int newSize = result.size() + 2 * charsRemaining; int spaceNeeded = end - begin + 2 * charsRemaining + add;
result.resize(newSize); int origSize = result.size();
result.resize(origSize + spaceNeeded);
// set the output variable // we know that resize() above detached, so we bypass the reference count check
output = reinterpret_cast<ushort *>(result.data()) + charsProcessed; output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()))
+ origSize;
// copy the chars we've already processed
int i;
for (i = 0; i < charsProcessed; ++i)
output[i] = begin[i];
output += i;
} }
} }
@ -180,7 +189,8 @@ static inline bool isUnicodeNonCharacter(uint ucs4)
} }
// returns true if we performed an UTF-8 decoding // returns true if we performed an UTF-8 decoding
static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded) static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
const ushort *end, ushort decoded)
{ {
int charsNeeded; int charsNeeded;
uint min_uc; uint min_uc;
@ -191,15 +201,15 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
// however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
return false; return false;
} else if (decoded < 0xe0) { } else if (decoded < 0xe0) {
charsNeeded = 1; charsNeeded = 2;
min_uc = 0x80; min_uc = 0x80;
uc = decoded & 0x1f; uc = decoded & 0x1f;
} else if (decoded < 0xf0) { } else if (decoded < 0xf0) {
charsNeeded = 2; charsNeeded = 3;
min_uc = 0x800; min_uc = 0x800;
uc = decoded & 0x0f; uc = decoded & 0x0f;
} else if (decoded < 0xf5) { } else if (decoded < 0xf5) {
charsNeeded = 3; charsNeeded = 4;
min_uc = 0x10000; min_uc = 0x10000;
uc = decoded & 0x07; uc = decoded & 0x07;
} else { } else {
@ -210,10 +220,10 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
} }
// are there enough remaining? // are there enough remaining?
if (end - input < 3*charsNeeded + 2) if (end - input < 3*charsNeeded)
return false; return false;
if (input[2] != '%') if (input[3] != '%')
return false; return false;
// first continuation character // first continuation character
@ -223,8 +233,8 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
uc <<= 6; uc <<= 6;
uc |= decoded & 0x3f; uc |= decoded & 0x3f;
if (charsNeeded > 1) { if (charsNeeded > 2) {
if (input[5] != '%') if (input[6] != '%')
return false; return false;
// second continuation character // second continuation character
@ -234,8 +244,8 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
uc <<= 6; uc <<= 6;
uc |= decoded & 0x3f; uc |= decoded & 0x3f;
if (charsNeeded > 2) { if (charsNeeded > 3) {
if (input[8] != '%') if (input[9] != '%')
return false; return false;
// third continuation character // third continuation character
@ -253,36 +263,28 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
if (isUnicodeNonCharacter(uc) || (uc >= 0xD800 && uc <= 0xDFFF) || uc >= 0x110000) if (isUnicodeNonCharacter(uc) || (uc >= 0xD800 && uc <= 0xDFFF) || uc >= 0x110000)
return false; return false;
// detach if necessary
if (!output) {
// create enough space if the rest of the string needed to be percent-encoded
int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
int charsRemaining = end - input - 2 - 3*charsNeeded;
int newSize = result.size() + 2 * charsRemaining;
result.resize(newSize);
// set the output variable
output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
}
if (!QChar::requiresSurrogates(uc)) { if (!QChar::requiresSurrogates(uc)) {
// UTF-8 decoded and no surrogates are required // UTF-8 decoded and no surrogates are required
// detach if necessary
ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 1);
*output++ = uc; *output++ = uc;
} else { } else {
// UTF-8 decoded to something that requires a surrogate pair // UTF-8 decoded to something that requires a surrogate pair
ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 2);
*output++ = QChar::highSurrogate(uc); *output++ = QChar::highSurrogate(uc);
*output++ = QChar::lowSurrogate(uc); *output++ = QChar::lowSurrogate(uc);
} }
input += charsNeeded * 3 + 2; input += charsNeeded * 3 - 1;
return true; return true;
} }
static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded) static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin,
const ushort *&input, const ushort *end, ushort decoded)
{ {
uint uc = decoded; uint uc = decoded;
if (QChar::isHighSurrogate(uc)) { if (QChar::isHighSurrogate(uc)) {
if (QChar::isLowSurrogate(*input)) if (input < end && QChar::isLowSurrogate(input[1]))
uc = QChar::surrogateToUcs4(uc, *input); uc = QChar::surrogateToUcs4(uc, input[1]);
} }
// note: we will encode bad UTF-16 to UTF-8 // note: we will encode bad UTF-16 to UTF-8
@ -293,28 +295,23 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
// detach // detach
if (!output) { if (!output) {
// create enough space if the rest of the string needed to be percent-encoded // we need 3 * utf8len for the encoded UTF-8 sequence
int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1; // but ensureDetached already adds 3 for the char we're processing
int charsRemaining = end - input; ensureDetached(result, output, begin, input, end, 3*utf8len - 3);
int newSize = result.size() + 2 * charsRemaining - 1 + 3*utf8len;
result.resize(newSize);
// set the output variable
output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
} else { } else {
// verify that there's enough space or expand // verify that there's enough space or expand
int charsRemaining = end - input; int charsRemaining = end - input - 1; // not including this one
int pos = output - reinterpret_cast<const ushort *>(result.constData()); int pos = output - reinterpret_cast<const ushort *>(result.constData());
int spaceRemaining = result.size() - pos; int spaceRemaining = result.size() - pos;
if (spaceRemaining < 3*charsRemaining + 3*utf8len) { if (spaceRemaining < 3*charsRemaining + 3*utf8len) {
// must resize // must resize
result.resize(result.size() + 3*utf8len); result.resize(result.size() + 3*utf8len);
output = reinterpret_cast<ushort *>(result.data()) + pos;
}
}
if (QChar::requiresSurrogates(uc)) // we know that resize() above detached, so we bypass the reference count check
++input; output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()));
output += pos;
}
}
// write the sequence // write the sequence
if (uc < 0x800) { if (uc < 0x800) {
@ -337,6 +334,9 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
*output++ = '%'; *output++ = '%';
*output++ = encodeNibble(c >> 4); *output++ = encodeNibble(c >> 4);
*output++ = encodeNibble(c & 0xf); *output++ = encodeNibble(c & 0xf);
// this was a surrogate pair
++input;
} else { } else {
// first of three bytes // first of three bytes
c = 0xe0 | uchar(uc >> 12); c = 0xe0 | uchar(uc >> 12);
@ -359,22 +359,21 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
*output++ = encodeNibble(c & 0xf); *output++ = encodeNibble(c & 0xf);
} }
static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding, static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding,
const uchar *actionTable, bool retryBadEncoding) const uchar *actionTable, bool retryBadEncoding)
{ {
QString result = component; const int origSize = result.size();
const ushort *input = reinterpret_cast<const ushort *>(component.constData()); const ushort *input = begin;
const ushort * const end = input + component.length();
ushort *output = 0; ushort *output = 0;
while (input != end) { for ( ; input != end; ++input) {
register ushort c; register ushort c;
EncodingAction action; EncodingAction action;
// try a run where no change is necessary // try a run where no change is necessary
while (input != end) { for ( ; input != end; ++input) {
c = *input++; c = *input;
if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U) if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U)
goto non_trivial; goto non_trivial;
action = EncodingAction(actionTable[c - ' ']); action = EncodingAction(actionTable[c - ' ']);
if (action == EncodeCharacter) if (action == EncodeCharacter)
@ -388,23 +387,23 @@ non_trivial:
register uint decoded; register uint decoded;
if (c == '%' && retryBadEncoding) { if (c == '%' && retryBadEncoding) {
// always write "%25" // always write "%25"
ensureDetached(result, output, input, end); ensureDetached(result, output, begin, input, end);
*output++ = '%'; *output++ = '%';
*output++ = '2'; *output++ = '2';
*output++ = '5'; *output++ = '5';
continue; continue;
} else if (c == '%') { } else if (c == '%') {
// check if the input is valid // check if the input is valid
if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
// not valid, retry // not valid, retry
result.clear(); result.resize(origSize);
return recode(component, encoding, actionTable, true); return recode(result, begin, end, encoding, actionTable, true);
} }
if (decoded >= 0x80) { if (decoded >= 0x80) {
// decode the UTF-8 sequence // decode the UTF-8 sequence
if (encoding & QUrl::DecodeUnicode && if (encoding & QUrl::DecodeUnicode &&
encodedUtf8ToUcs4(result, output, input, end, decoded)) encodedUtf8ToUtf16(result, output, begin, input, end, decoded))
continue; continue;
// decoding the encoded UTF-8 failed // decoding the encoded UTF-8 failed
@ -416,7 +415,7 @@ non_trivial:
decoded = c; decoded = c;
if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) { if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) {
// encode the UTF-8 sequence // encode the UTF-8 sequence
unicodeToEncodedUtf8(result, output, input, end, decoded); unicodeToEncodedUtf8(result, output, begin, input, end, decoded);
continue; continue;
} else if (decoded >= 0x80) { } else if (decoded >= 0x80) {
if (output) if (output)
@ -437,34 +436,37 @@ non_trivial:
if (c == '%' && action != DecodeCharacter) { if (c == '%' && action != DecodeCharacter) {
// cases 5 and 6: it's encoded and we're leaving it as it is // cases 5 and 6: it's encoded and we're leaving it as it is
// except we're pedantic and we'll uppercase the hex // except we're pedantic and we'll uppercase the hex
if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) { if (output || !isUpperHex(input[1]) || !isUpperHex(input[2])) {
ensureDetached(result, output, input, end); ensureDetached(result, output, begin, input, end);
*output++ = '%'; *output++ = '%';
*output++ = toUpperHex(*input++); *output++ = toUpperHex(*++input);
*output++ = toUpperHex(*input++); *output++ = toUpperHex(*++input);
} }
} else if (c == '%' && action == DecodeCharacter) { } else if (c == '%' && action == DecodeCharacter) {
// case 4: we need to decode // case 4: we need to decode
ensureDetached(result, output, input, end); ensureDetached(result, output, begin, input, end);
*output++ = decoded; *output++ = decoded;
input += 2; input += 2;
} else { } else {
// must be case 3: we need to encode // must be case 3: we need to encode
ensureDetached(result, output, input, end); ensureDetached(result, output, begin, input, end);
*output++ = '%'; *output++ = '%';
*output++ = encodeNibble(c >> 4); *output++ = encodeNibble(c >> 4);
*output++ = encodeNibble(c & 0xf); *output++ = encodeNibble(c & 0xf);
} }
} }
if (output) if (output) {
result.truncate(output - reinterpret_cast<const ushort *>(result.constData())); int len = output - reinterpret_cast<const ushort *>(result.constData());
return result; result.truncate(len);
return len - origSize;
}
return 0;
} }
Q_AUTOTEST_EXPORT QString Q_AUTOTEST_EXPORT int
qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding, qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end,
const ushort *tableModifications) QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications)
{ {
uchar actionTable[sizeof defaultActionTable]; uchar actionTable[sizeof defaultActionTable];
if (encoding & QUrl::DecodeAllDelimiters) { if (encoding & QUrl::DecodeAllDelimiters) {
@ -487,7 +489,8 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding
actionTable[uchar(*p) - ' '] = *p >> 8; actionTable[uchar(*p) - ' '] = *p >> 8;
} }
return recode(component, encoding, actionTable, false); return recode(appendTo, reinterpret_cast<const ushort *>(begin), reinterpret_cast<const ushort *>(end),
encoding, actionTable, false);
} }
QT_END_NAMESPACE QT_END_NAMESPACE

View File

@ -50,8 +50,8 @@ Q_CORE_EXPORT extern void qt_nameprep(QString *source, int from);
Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int); Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int);
Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output); Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output);
Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc); Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc);
Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding, Q_CORE_EXPORT int qt_urlRecode(QString &appendTo, const QChar *input, const QChar *end,
const ushort *tableModifications = 0); QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications = 0);
QT_END_NAMESPACE QT_END_NAMESPACE
// For testsuites // For testsuites
@ -762,7 +762,6 @@ void tst_QUrlInternal::correctEncodedMistakes_data()
QTest::addColumn<QString>("input"); QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expected"); QTest::addColumn<QString>("expected");
QTest::newRow("null") << QString() << QString();
QTest::newRow("empty") << "" << ""; QTest::newRow("empty") << "" << "";
// these contain one invalid percent // these contain one invalid percent
@ -808,9 +807,13 @@ void tst_QUrlInternal::correctEncodedMistakes()
QFETCH(QString, input); QFETCH(QString, input);
QFETCH(QString, expected); QFETCH(QString, expected);
QString output = qt_urlRecode(input, QUrl::DecodeUnicode); // prepend some data to be sure that it remains there
QString output = QTest::currentDataTag();
expected.prepend(output);
if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), QUrl::DecodeUnicode))
output += input;
QCOMPARE(output, expected); QCOMPARE(output, expected);
QCOMPARE(output.isNull(), expected.isNull());
} }
static void addUtf8Data(const char *name, const char *data) static void addUtf8Data(const char *name, const char *data)
@ -893,7 +896,7 @@ void tst_QUrlInternal::encodingRecode_data()
addUtf8Data("utf8-3char-2", "\xED\x9F\xBF"); // U+D7FF addUtf8Data("utf8-3char-2", "\xED\x9F\xBF"); // U+D7FF
addUtf8Data("utf8-3char-3", "\xEE\x80\x80"); // U+E000 addUtf8Data("utf8-3char-3", "\xEE\x80\x80"); // U+E000
addUtf8Data("utf8-3char-4", "\xEF\xBF\xBD"); // U+FFFD addUtf8Data("utf8-3char-4", "\xEF\xBF\xBD"); // U+FFFD
addUtf8Data("utf8-2char-1", "\xF0\x90\x80\x80"); // U+10000 addUtf8Data("utf8-4char-1", "\xF0\x90\x80\x80"); // U+10000
addUtf8Data("utf8-4char-2", "\xF4\x8F\xBF\xBD"); // U+10FFFD addUtf8Data("utf8-4char-2", "\xF4\x8F\xBF\xBD"); // U+10FFFD
// longer UTF-8 sequences, mixed with unreserved // longer UTF-8 sequences, mixed with unreserved
@ -931,9 +934,13 @@ void tst_QUrlInternal::encodingRecode()
QFETCH(QString, expected); QFETCH(QString, expected);
QFETCH(QUrl::ComponentFormattingOptions, encodingMode); QFETCH(QUrl::ComponentFormattingOptions, encodingMode);
QString output = qt_urlRecode(input, encodingMode); // prepend some data to be sure that it remains there
QString output = QTest::currentDataTag();
expected.prepend(output);
if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), encodingMode))
output += input;
QCOMPARE(output, expected); QCOMPARE(output, expected);
QCOMPARE(output.isNull(), expected.isNull());
} }
void tst_QUrlInternal::encodingRecodeInvalidUtf8_data() void tst_QUrlInternal::encodingRecodeInvalidUtf8_data()
@ -957,13 +964,18 @@ void tst_QUrlInternal::encodingRecodeInvalidUtf8()
QFETCH(QByteArray, utf8); QFETCH(QByteArray, utf8);
QString input = utf8.toPercentEncoding(); QString input = utf8.toPercentEncoding();
QString output; // prepend some data to be sure that it remains there
output = qt_urlRecode(input, QUrl::DecodeUnicode); QString output = QTest::currentDataTag();
QCOMPARE(output, input);
if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), QUrl::DecodeUnicode))
output += input;
QCOMPARE(output, QTest::currentDataTag() + input);
// this is just control // this is just control
output = qt_urlRecode(input, QUrl::FullyEncoded); output = QTest::currentDataTag();
QCOMPARE(output, input); if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), QUrl::FullyEncoded))
output += input;
QCOMPARE(output, QTest::currentDataTag() + input);
} }
QTEST_APPLESS_MAIN(tst_QUrlInternal) QTEST_APPLESS_MAIN(tst_QUrlInternal)