Refactor the URL recoder a little

Change it to operate on QChar pointers, which gains a little in performance. This also avoids unnecessary detaching in the QString source. In addition, make the output be appended to an existing QString. This will be useful later when we're reconstructing a URL from its components. Change-Id: I7e2f64028277637bd329af5f98001ace253a50c7 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
2011-09-08 17:40:36 +02:00 · 2011-09-08 17:40:36 +02:00 · b75aa795fe
commit b75aa795fe
parent 73e16b15a6
2 changed files with 104 additions and 89 deletions
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@ -138,8 +138,8 @@ static inline ushort decodeNibble(ushort c)
 // assumes that the range has been checked already
 static inline ushort decodePercentEncoding(const ushort *input)
 {
-    ushort c1 = input[0];
-    ushort c2 = input[1];
+    ushort c1 = input[1];
+    ushort c2 = input[2];
    if (!isHex(c1) || !isHex(c2))
        return ushort(-1);
    return decodeNibble(c1) << 4 | decodeNibble(c2);
@ -151,18 +151,27 @@ static inline ushort encodeNibble(ushort c)
    return hexnumbers[c & 0xf];
 }

-static void ensureDetached(QString &result, ushort *&output, const ushort *input, const ushort *end)
+static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end,
+                           int add = 0)
 {
    if (!output) {
        // now detach
        // create enough space if the rest of the string needed to be percent-encoded
-        int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
-        int charsRemaining = end - input + 1;
-        int newSize = result.size() + 2 * charsRemaining;
-        result.resize(newSize);
+        int charsProcessed = input - begin;
+        int charsRemaining = end - input;
+        int spaceNeeded = end - begin + 2 * charsRemaining + add;
+        int origSize = result.size();
+        result.resize(origSize + spaceNeeded);

-        // set the output variable
-        output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
+        // we know that resize() above detached, so we bypass the reference count check
+        output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()))
+                 + origSize;
+
+        // copy the chars we've already processed
+        int i;
+        for (i = 0; i < charsProcessed; ++i)
+            output[i] = begin[i];
+        output += i;
    }
 }

@ -180,7 +189,8 @@ static inline bool isUnicodeNonCharacter(uint ucs4)
 }

 // returns true if we performed an UTF-8 decoding
-static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
+static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
+                               const ushort *end, ushort decoded)
 {
    int charsNeeded;
    uint min_uc;
@ -191,15 +201,15 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
        // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
        return false;
    } else if (decoded < 0xe0) {
-        charsNeeded = 1;
+        charsNeeded = 2;
        min_uc = 0x80;
        uc = decoded & 0x1f;
    } else if (decoded < 0xf0) {
-        charsNeeded = 2;
+        charsNeeded = 3;
        min_uc = 0x800;
        uc = decoded & 0x0f;
    } else if (decoded < 0xf5) {
-        charsNeeded = 3;
+        charsNeeded = 4;
        min_uc = 0x10000;
        uc = decoded & 0x07;
    } else {
@ -210,10 +220,10 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
    }

    // are there enough remaining?
-    if (end - input < 3*charsNeeded + 2)
+    if (end - input < 3*charsNeeded)
        return false;

-    if (input[2] != '%')
+    if (input[3] != '%')
        return false;

    // first continuation character
@ -223,8 +233,8 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
    uc <<= 6;
    uc |= decoded & 0x3f;

-    if (charsNeeded > 1) {
-        if (input[5] != '%')
+    if (charsNeeded > 2) {
+        if (input[6] != '%')
            return false;

        // second continuation character
@ -234,8 +244,8 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
        uc <<= 6;
        uc |= decoded & 0x3f;

-        if (charsNeeded > 2) {
-            if (input[8] != '%')
+        if (charsNeeded > 3) {
+            if (input[9] != '%')
                return false;

            // third continuation character
@ -253,36 +263,28 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
    if (isUnicodeNonCharacter(uc) || (uc >= 0xD800 && uc <= 0xDFFF) || uc >= 0x110000)
        return false;

-    // detach if necessary
-    if (!output) {
-        // create enough space if the rest of the string needed to be percent-encoded
-        int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
-        int charsRemaining = end - input - 2 - 3*charsNeeded;
-        int newSize = result.size() + 2 * charsRemaining;
-        result.resize(newSize);
-
-        // set the output variable
-        output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
-    }
-
    if (!QChar::requiresSurrogates(uc)) {
        // UTF-8 decoded and no surrogates are required
+        // detach if necessary
+        ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 1);
        *output++ = uc;
    } else {
        // UTF-8 decoded to something that requires a surrogate pair
+        ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 2);
        *output++ = QChar::highSurrogate(uc);
        *output++ = QChar::lowSurrogate(uc);
    }
-    input += charsNeeded * 3 + 2;
+    input += charsNeeded * 3 - 1;
    return true;
 }

-static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
+static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin,
+                                 const ushort *&input, const ushort *end, ushort decoded)
 {
    uint uc = decoded;
    if (QChar::isHighSurrogate(uc)) {
-        if (QChar::isLowSurrogate(*input))
-            uc = QChar::surrogateToUcs4(uc, *input);
+        if (input < end && QChar::isLowSurrogate(input[1]))
+            uc = QChar::surrogateToUcs4(uc, input[1]);
    }

    // note: we will encode bad UTF-16 to UTF-8
@ -293,29 +295,24 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort

    // detach
    if (!output) {
-        // create enough space if the rest of the string needed to be percent-encoded
-        int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
-        int charsRemaining = end - input;
-        int newSize = result.size() + 2 * charsRemaining - 1 + 3*utf8len;
-        result.resize(newSize);
-
-        // set the output variable
-        output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
+        // we need 3 * utf8len for the encoded UTF-8 sequence
+        // but ensureDetached already adds 3 for the char we're processing
+        ensureDetached(result, output, begin, input, end, 3*utf8len - 3);
    } else {
        // verify that there's enough space or expand
-        int charsRemaining = end - input;
+        int charsRemaining = end - input - 1; // not including this one
        int pos = output - reinterpret_cast<const ushort *>(result.constData());
        int spaceRemaining = result.size() - pos;
        if (spaceRemaining < 3*charsRemaining + 3*utf8len) {
            // must resize
            result.resize(result.size() + 3*utf8len);
-            output = reinterpret_cast<ushort *>(result.data()) + pos;
+
+            // we know that resize() above detached, so we bypass the reference count check
+            output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()));
+            output += pos;
        }
    }

-    if (QChar::requiresSurrogates(uc))
-        ++input;
-
    // write the sequence
    if (uc < 0x800) {
        // first of two bytes
@ -337,6 +334,9 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
            *output++ = '%';
            *output++ = encodeNibble(c >> 4);
            *output++ = encodeNibble(c & 0xf);
+
+            // this was a surrogate pair
+            ++input;
        } else {
            // first of three bytes
            c = 0xe0 | uchar(uc >> 12);
@ -359,22 +359,21 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
    *output++ = encodeNibble(c & 0xf);
 }

-static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding,
-                      const uchar *actionTable, bool retryBadEncoding)
+static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding,
+                  const uchar *actionTable, bool retryBadEncoding)
 {
-    QString result = component;
-    const ushort *input = reinterpret_cast<const ushort *>(component.constData());
-    const ushort * const end = input + component.length();
+    const int origSize = result.size();
+    const ushort *input = begin;
    ushort *output = 0;

-    while (input != end) {
+    for ( ; input != end; ++input) {
        register ushort c;
        EncodingAction action;

        // try a run where no change is necessary
-        while (input != end) {
-            c = *input++;
-            if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U)
+        for ( ; input != end; ++input) {
+            c = *input;
+            if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U)
                goto non_trivial;
            action = EncodingAction(actionTable[c - ' ']);
            if (action == EncodeCharacter)
@ -388,23 +387,23 @@ non_trivial:
        register uint decoded;
        if (c == '%' && retryBadEncoding) {
            // always write "%25"
-            ensureDetached(result, output, input, end);
+            ensureDetached(result, output, begin, input, end);
            *output++ = '%';
            *output++ = '2';
            *output++ = '5';
            continue;
        } else if (c == '%') {
            // check if the input is valid
-            if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
+            if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
                // not valid, retry
-                result.clear();
-                return recode(component, encoding, actionTable, true);
+                result.resize(origSize);
+                return recode(result, begin, end, encoding, actionTable, true);
            }

            if (decoded >= 0x80) {
                // decode the UTF-8 sequence
                if (encoding & QUrl::DecodeUnicode &&
-                        encodedUtf8ToUcs4(result, output, input, end, decoded))
+                        encodedUtf8ToUtf16(result, output, begin, input, end, decoded))
                    continue;

                // decoding the encoded UTF-8 failed
@ -416,7 +415,7 @@ non_trivial:
            decoded = c;
            if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) {
                // encode the UTF-8 sequence
-                unicodeToEncodedUtf8(result, output, input, end, decoded);
+                unicodeToEncodedUtf8(result, output, begin, input, end, decoded);
                continue;
            } else if (decoded >= 0x80) {
                if (output)
@ -437,34 +436,37 @@ non_trivial:
        if (c == '%' && action != DecodeCharacter) {
            // cases 5 and 6: it's encoded and we're leaving it as it is
            // except we're pedantic and we'll uppercase the hex
-            if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) {
-                ensureDetached(result, output, input, end);
+            if (output || !isUpperHex(input[1]) || !isUpperHex(input[2])) {
+                ensureDetached(result, output, begin, input, end);
                *output++ = '%';
-                *output++ = toUpperHex(*input++);
-                *output++ = toUpperHex(*input++);
+                *output++ = toUpperHex(*++input);
+                *output++ = toUpperHex(*++input);
            }
        } else if (c == '%' && action == DecodeCharacter) {
            // case 4: we need to decode
-            ensureDetached(result, output, input, end);
+            ensureDetached(result, output, begin, input, end);
            *output++ = decoded;
            input += 2;
        } else {
            // must be case 3: we need to encode
-            ensureDetached(result, output, input, end);
+            ensureDetached(result, output, begin, input, end);
            *output++ = '%';
            *output++ = encodeNibble(c >> 4);
            *output++ = encodeNibble(c & 0xf);
        }
    }

-    if (output)
-        result.truncate(output - reinterpret_cast<const ushort *>(result.constData()));
-    return result;
+    if (output) {
+        int len = output - reinterpret_cast<const ushort *>(result.constData());
+        result.truncate(len);
+        return len - origSize;
+    }
+    return 0;
 }

-Q_AUTOTEST_EXPORT QString
-qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
-             const ushort *tableModifications)
+Q_AUTOTEST_EXPORT int
+qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end,
+             QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications)
 {
    uchar actionTable[sizeof defaultActionTable];
    if (encoding & QUrl::DecodeAllDelimiters) {
@ -487,7 +489,8 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding
            actionTable[uchar(*p) - ' '] = *p >> 8;
    }

-    return recode(component, encoding, actionTable, false);
+    return recode(appendTo, reinterpret_cast<const ushort *>(begin), reinterpret_cast<const ushort *>(end),
+                  encoding, actionTable, false);
 }

 QT_END_NAMESPACE
--- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
+++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
@ -50,8 +50,8 @@ Q_CORE_EXPORT extern void qt_nameprep(QString *source, int from);
 Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int);
 Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output);
 Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc);
-Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
-                                   const ushort *tableModifications = 0);
+Q_CORE_EXPORT int qt_urlRecode(QString &appendTo, const QChar *input, const QChar *end,
+                               QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications = 0);
 QT_END_NAMESPACE

 // For testsuites
@ -762,7 +762,6 @@ void tst_QUrlInternal::correctEncodedMistakes_data()
    QTest::addColumn<QString>("input");
    QTest::addColumn<QString>("expected");

-    QTest::newRow("null") << QString() << QString();
    QTest::newRow("empty") << "" << "";

    // these contain one invalid percent
@ -808,9 +807,13 @@ void tst_QUrlInternal::correctEncodedMistakes()
    QFETCH(QString, input);
    QFETCH(QString, expected);

-    QString output = qt_urlRecode(input, QUrl::DecodeUnicode);
+    // prepend some data to be sure that it remains there
+    QString output = QTest::currentDataTag();
+    expected.prepend(output);
+
+    if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), QUrl::DecodeUnicode))
+        output += input;
    QCOMPARE(output, expected);
-    QCOMPARE(output.isNull(), expected.isNull());
 }

 static void addUtf8Data(const char *name, const char *data)
@ -893,7 +896,7 @@ void tst_QUrlInternal::encodingRecode_data()
    addUtf8Data("utf8-3char-2", "\xED\x9F\xBF"); // U+D7FF
    addUtf8Data("utf8-3char-3", "\xEE\x80\x80"); // U+E000
    addUtf8Data("utf8-3char-4", "\xEF\xBF\xBD"); // U+FFFD
-    addUtf8Data("utf8-2char-1", "\xF0\x90\x80\x80"); // U+10000
+    addUtf8Data("utf8-4char-1", "\xF0\x90\x80\x80"); // U+10000
    addUtf8Data("utf8-4char-2", "\xF4\x8F\xBF\xBD"); // U+10FFFD

    // longer UTF-8 sequences, mixed with unreserved
@ -931,9 +934,13 @@ void tst_QUrlInternal::encodingRecode()
    QFETCH(QString, expected);
    QFETCH(QUrl::ComponentFormattingOptions, encodingMode);

-    QString output = qt_urlRecode(input, encodingMode);
+    // prepend some data to be sure that it remains there
+    QString output = QTest::currentDataTag();
+    expected.prepend(output);
+
+    if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), encodingMode))
+        output += input;
    QCOMPARE(output, expected);
-    QCOMPARE(output.isNull(), expected.isNull());
 }

 void tst_QUrlInternal::encodingRecodeInvalidUtf8_data()
@ -957,13 +964,18 @@ void tst_QUrlInternal::encodingRecodeInvalidUtf8()
    QFETCH(QByteArray, utf8);
    QString input = utf8.toPercentEncoding();

-    QString output;
-    output = qt_urlRecode(input, QUrl::DecodeUnicode);
-    QCOMPARE(output, input);
+    // prepend some data to be sure that it remains there
+    QString output = QTest::currentDataTag();
+
+    if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), QUrl::DecodeUnicode))
+        output += input;
+    QCOMPARE(output, QTest::currentDataTag() + input);

    // this is just control
-    output = qt_urlRecode(input, QUrl::FullyEncoded);
-    QCOMPARE(output, input);
+    output = QTest::currentDataTag();
+    if (!qt_urlRecode(output, input.constData(), input.constData() + input.length(), QUrl::FullyEncoded))
+        output += input;
+    QCOMPARE(output, QTest::currentDataTag() + input);
 }

 QTEST_APPLESS_MAIN(tst_QUrlInternal)