querystring: improve parse() performance

This commit improves parse() performance by ~20-200% with the various querystring-parse benchmarks. Some optimization strategies used in this commit include: * Combining multiple searches (for '&', '=', and '+') on the same string into a single loop * Avoiding string.split() * Minimizing creation of temporary strings * Avoiding string decoding if no encoded bytes were found and the default string decoder is being used PR-URL: https://github.com/nodejs/node/pull/5012 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Roman Reiss <me@silverwind.io> Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
2016-01-31 13:54:18 -05:00 · 2016-01-31 13:54:18 -05:00 · a2a69a2b63
commit a2a69a2b63
parent 90451a67ca
3 changed files with 191 additions and 62 deletions
--- a/benchmark/querystring/querystring-parse.js
+++ b/benchmark/querystring/querystring-parse.js
@ -3,7 +3,13 @@ var querystring = require('querystring');
 var v8 = require('v8');

 var bench = common.createBenchmark(main, {
-  type: ['noencode', 'encodemany', 'encodelast', 'multivalue'],
+  type: ['noencode',
+         'multicharsep',
+         'encodemany',
+         'encodelast',
+         'multivalue',
+         'multivaluemany',
+         'manypairs'],
  n: [1e6],
 });

@ -13,22 +19,38 @@ function main(conf) {

  var inputs = {
    noencode: 'foo=bar&baz=quux&xyzzy=thud',
+    multicharsep: 'foo=bar&&&&&&&&&&baz=quux&&&&&&&&&&xyzzy=thud',
    encodemany: '%66%6F%6F=bar&%62%61%7A=quux&xyzzy=%74h%75d',
    encodelast: 'foo=bar&baz=quux&xyzzy=thu%64',
-    multivalue: 'foo=bar&foo=baz&foo=quux&quuy=quuz'
+    multivalue: 'foo=bar&foo=baz&foo=quux&quuy=quuz',
+    multivaluemany: 'foo=bar&foo=baz&foo=quux&quuy=quuz&foo=abc&foo=def&' +
+                    'foo=ghi&foo=jkl&foo=mno&foo=pqr&foo=stu&foo=vwxyz',
+    manypairs: 'a&b&c&d&e&f&g&h&i&j&k&l&m&n&o&p&q&r&s&t&u&v&w&x&y&z'
  };
  var input = inputs[type];

  // Force-optimize querystring.parse() so that the benchmark doesn't get
  // disrupted by the optimizer kicking in halfway through.
-  for (var name in inputs)
-    querystring.parse(inputs[name]);
-
  v8.setFlagsFromString('--allow_natives_syntax');
-  eval('%OptimizeFunctionOnNextCall(querystring.parse)');
-
-  bench.start();
-  for (var i = 0; i < n; i += 1)
+  if (type !== 'multicharsep') {
    querystring.parse(input);
-  bench.end(n);
+    eval('%OptimizeFunctionOnNextCall(querystring.parse)');
+    querystring.parse(input);
+  } else {
+    querystring.parse(input, '&&&&&&&&&&');
+    eval('%OptimizeFunctionOnNextCall(querystring.parse)');
+    querystring.parse(input, '&&&&&&&&&&');
+  }
+
+  if (type !== 'multicharsep') {
+    bench.start();
+    for (var i = 0; i < n; i += 1)
+      querystring.parse(input);
+    bench.end(n);
+  } else {
+    bench.start();
+    for (var i = 0; i < n; i += 1)
+      querystring.parse(input, '&&&&&&&&&&');
+    bench.end(n);
+  }
 }
--- a/lib/querystring.js
+++ b/lib/querystring.js
@ -78,13 +78,14 @@ QueryString.unescapeBuffer = function(s, decodeSpaces) {
 };


-QueryString.unescape = function(s, decodeSpaces) {
+function qsUnescape(s, decodeSpaces) {
  try {
    return decodeURIComponent(s);
  } catch (e) {
    return QueryString.unescapeBuffer(s, decodeSpaces).toString();
  }
-};
+}
+QueryString.unescape = qsUnescape;


 var hexTable = new Array(256);
@ -198,63 +199,183 @@ QueryString.stringify = QueryString.encode = function(obj, sep, eq, options) {
  return '';
 };

-// Parse a key=val string.
+// Parse a key/val string.
 QueryString.parse = QueryString.decode = function(qs, sep, eq, options) {
  sep = sep || '&';
  eq = eq || '=';
-  const eqLen = eq.length;
-  var obj = {};
+
+  const obj = {};

  if (typeof qs !== 'string' || qs.length === 0) {
    return obj;
  }

+  if (typeof sep !== 'string')
+    sep += '';
+
+  const eqLen = eq.length;
+  const sepLen = sep.length;
+
  var maxKeys = 1000;
  if (options && typeof options.maxKeys === 'number') {
    maxKeys = options.maxKeys;
  }

-  // maxKeys <= 0 means that we should not limit keys count
-  if (maxKeys > 0 && isFinite(maxKeys)) {
-    qs = qs.split(sep, maxKeys);
-  } else {
-    qs = qs.split(sep);
-  }
-
-  var len = qs.length;
+  var pairs = Infinity;
+  if (maxKeys > 0)
+    pairs = maxKeys;

  var decode = QueryString.unescape;
  if (options && typeof options.decodeURIComponent === 'function') {
    decode = options.decodeURIComponent;
  }
+  const customDecode = (decode !== qsUnescape);

-  var keys = [];
-  for (var i = 0; i < len; ++i) {
-    // replacePlus() is used instead of a regexp because it is ~15-30% faster
-    // with v8 4.7
-    const x = replacePlus(qs[i]);
-    const idx = x.indexOf(eq);
-    var k, v;
+  const keys = [];
+  var lastPos = 0;
+  var sepIdx = 0;
+  var eqIdx = 0;
+  var key = '';
+  var value = '';
+  var keyEncoded = customDecode;
+  var valEncoded = customDecode;
+  var encodeCheck = 0;
+  for (var i = 0; i < qs.length; ++i) {
+    const code = qs.charCodeAt(i);

-    if (idx >= 0) {
-      k = decodeStr(x.substring(0, idx), decode);
-      v = decodeStr(x.substring(idx + eqLen), decode);
+    // Try matching key/value pair separator (e.g. '&')
+    if (code === sep.charCodeAt(sepIdx)) {
+      if (++sepIdx === sepLen) {
+        // Key/value pair separator match!
+        const end = i - sepIdx + 1;
+        if (eqIdx < eqLen) {
+          // If we didn't find the key/value separator, treat the substring as
+          // part of the key instead of the value
+          if (lastPos < end)
+            key += qs.slice(lastPos, end);
+        } else if (lastPos < end)
+          value += qs.slice(lastPos, end);
+        if (keyEncoded)
+          key = decodeStr(key, decode);
+        if (valEncoded)
+          value = decodeStr(value, decode);
+        // Use a key array lookup instead of using hasOwnProperty(), which is
+        // slower
+        if (keys.indexOf(key) === -1) {
+          obj[key] = value;
+          keys[keys.length] = key;
+        } else {
+          const curValue = obj[key];
+          // `instanceof Array` is used instead of Array.isArray() because it
+          // is ~15-20% faster with v8 4.7 and is safe to use because we are
+          // using it with values being created within this function
+          if (curValue instanceof Array)
+            curValue[curValue.length] = value;
+          else
+            obj[key] = [curValue, value];
+        }
+        if (--pairs === 0)
+          break;
+        keyEncoded = valEncoded = customDecode;
+        encodeCheck = 0;
+        key = value = '';
+        lastPos = i + 1;
+        sepIdx = eqIdx = 0;
+      }
+      continue;
    } else {
-      k = decodeStr(x, decode);
-      v = '';
+      sepIdx = 0;
+      if (!valEncoded) {
+        // Try to match an (valid) encoded byte (once) to minimize unnecessary
+        // calls to string decoding functions
+        if (code === 37/*%*/) {
+          encodeCheck = 1;
+        } else if (encodeCheck > 0 &&
+                   ((code >= 48/*0*/ && code <= 57/*9*/) ||
+                    (code >= 65/*A*/ && code <= 70/*Z*/) ||
+                    (code >= 97/*a*/ && code <= 102/*z*/))) {
+          if (++encodeCheck === 3)
+            valEncoded = true;
+        } else {
+          encodeCheck = 0;
+        }
+      }
    }

-    // Use a key array lookup instead of using hasOwnProperty(), which is slower
-    if (keys.indexOf(k) === -1) {
-      obj[k] = v;
-      keys.push(k);
-    } else if (obj[k] instanceof Array) {
-      // `instanceof Array` is used instead of Array.isArray() because it is
-      // ~15-20% faster with v8 4.7 and is safe to use because we are using it
-      // with values being created within this function
-      obj[k].push(v);
+    // Try matching key/value separator (e.g. '=') if we haven't already
+    if (eqIdx < eqLen) {
+      if (code === eq.charCodeAt(eqIdx)) {
+        if (++eqIdx === eqLen) {
+          // Key/value separator match!
+          const end = i - eqIdx + 1;
+          if (lastPos < end)
+            key += qs.slice(lastPos, end);
+          encodeCheck = 0;
+          lastPos = i + 1;
+        }
+        continue;
+      } else {
+        eqIdx = 0;
+        if (!keyEncoded) {
+          // Try to match an (valid) encoded byte once to minimize unnecessary
+          // calls to string decoding functions
+          if (code === 37/*%*/) {
+            encodeCheck = 1;
+          } else if (encodeCheck > 0 &&
+                     ((code >= 48/*0*/ && code <= 57/*9*/) ||
+                      (code >= 65/*A*/ && code <= 70/*Z*/) ||
+                      (code >= 97/*a*/ && code <= 102/*z*/))) {
+            if (++encodeCheck === 3)
+              keyEncoded = true;
+          } else {
+            encodeCheck = 0;
+          }
+        }
+      }
+    }
+
+    if (code === 43/*+*/) {
+      if (eqIdx < eqLen) {
+        if (i - lastPos > 0)
+          key += qs.slice(lastPos, i);
+        key += '%20';
+        keyEncoded = true;
+      } else {
+        if (i - lastPos > 0)
+          value += qs.slice(lastPos, i);
+        value += '%20';
+        valEncoded = true;
+      }
+      lastPos = i + 1;
+    }
+  }
+
+  // Check if we have leftover key or value data
+  if (pairs > 0 && (lastPos < qs.length || eqIdx > 0)) {
+    if (lastPos < qs.length) {
+      if (eqIdx < eqLen)
+        key += qs.slice(lastPos);
+      else if (sepIdx < sepLen)
+        value += qs.slice(lastPos);
+    }
+    if (keyEncoded)
+      key = decodeStr(key, decode);
+    if (valEncoded)
+      value = decodeStr(value, decode);
+    // Use a key array lookup instead of using hasOwnProperty(), which is
+    // slower
+    if (keys.indexOf(key) === -1) {
+      obj[key] = value;
+      keys[keys.length] = key;
    } else {
-      obj[k] = [obj[k], v];
+      const curValue = obj[key];
+      // `instanceof Array` is used instead of Array.isArray() because it
+      // is ~15-20% faster with v8 4.7 and is safe to use because we are
+      // using it with values being created within this function
+      if (curValue instanceof Array)
+        curValue[curValue.length] = value;
+      else
+        obj[key] = [curValue, value];
    }
  }

@ -262,23 +383,6 @@ QueryString.parse = QueryString.decode = function(qs, sep, eq, options) {
 };


-function replacePlus(str) {
-  var ret = '';
-  var start = 0;
-  var i = -1;
-  while ((i = str.indexOf('+', i + 1)) !== -1) {
-    ret += str.slice(start, i);
-    ret += '%20';
-    start = i + 1;
-  }
-  if (start === 0)
-    return str;
-  if (start < str.length)
-    ret += str.slice(start);
-  return ret;
-}
-
-
 // v8 does not optimize functions with try-catch blocks, so we isolate them here
 // to minimize the damage
 function decodeStr(s, decoder) {
--- a/test/parallel/test-querystring.js
+++ b/test/parallel/test-querystring.js
@ -248,3 +248,6 @@ qs.unescape = function(str) {
 };
 assert.deepEqual(qs.parse('foo=bor'), {f__: 'b_r'});
 qs.unescape = prevUnescape;
+
+// test separator and "equals" parsing order
+assert.deepEqual(qs.parse('foo&bar', '&', '&'), { foo: '', bar: '' });