Close #1149 IDNA and Punycode support in url.parse
Using @bnoordhuis's punycode lib. Close #1174 also
This commit is contained in:
parent
08a334fa45
commit
2a848fa727
2
LICENSE
2
LICENSE
@ -69,3 +69,5 @@ The externally maintained libraries used by Node are:
|
|||||||
|
|
||||||
- lib/buffer_ieee754.js is copyright 2008 Fair Oaks Labs, Inc. and released
|
- lib/buffer_ieee754.js is copyright 2008 Fair Oaks Labs, Inc. and released
|
||||||
under the New BSD license.
|
under the New BSD license.
|
||||||
|
|
||||||
|
- lib/punycode.js is copyright 2011 Ben Noordhuis and released under the MIT license.
|
||||||
|
218
lib/punycode.js
Normal file
218
lib/punycode.js
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
// Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
// of this software and associated documentation files (the "Software"), to deal
|
||||||
|
// in the Software without restriction, including without limitation the rights
|
||||||
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the Software is
|
||||||
|
// furnished to do so, subject to the following conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be included in
|
||||||
|
// all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
// THE SOFTWARE.
|
||||||
|
|
||||||
|
// Public API of this module: the two string converters defined below.
// (Function declarations are hoisted, so they can be exported up front.)
exports.decode = decode;
exports.encode = encode;
|
||||||
|
|
||||||
|
// Bootstring parameters used by the Punycode routines in this file.

// Number of distinct digits ('a'..'z' and '0'..'9').
var BASE = 36;

// Lower and upper clamp for the per-digit threshold.
var TMIN = 1;
var TMAX = 26;

// Bias adaptation tuning: SKEW is added to the divisor of the final
// quotient, DAMP divides the very first delta (initial bias scaler).
var SKEW = 38;
var DAMP = 700;

// Starting state of the encoder/decoder: the bias, and the first code point
// that is not treated as basic (everything below 128 is copied verbatim).
var INITIAL_BIAS = 72;
var INITIAL_N = 128;
|
||||||
|
|
||||||
|
// Bias adaptation function (RFC 3492, section 6.1).
//
// Computes the bias to use for the next delta after one delta has been
// encoded or decoded.
//
//   delta     - the delta that was just processed (non-negative integer)
//   n_points  - number of code points handled so far, including this one
//   is_first  - true for the very first delta of the string, which is
//               scaled down by DAMP instead of being halved
//
// Returns the new bias as a non-negative integer.
function adapt_bias(delta, n_points, is_first) {
  // Scale back, then increase delta. Every division here must be an
  // integer division: keeping the fractional part changes both the loop
  // condition and the final quotient below, and therefore the bias.
  delta = Math.floor(delta / (is_first ? DAMP : 2));
  delta += Math.floor(delta / n_points);

  var s = BASE - TMIN;
  var t = Math.floor((s * TMAX) / 2); // threshold=455

  // Repeatedly divide delta until it fits under the threshold; each
  // division accounts for one more full digit (BASE) in the bias.
  var k = 0;
  while (delta > t) {
    delta = Math.floor(delta / s);
    k += BASE;
  }

  var a = (s + 1) * delta;
  var b = delta + SKEW;

  return k + Math.floor(a / b);
}
|
||||||
|
|
||||||
|
// Returns the smallest value in `codepoints` that is greater than or equal
// to `n`. Throws an Error when the list holds no such value, which callers
// are not expected to trigger.
function next_smallest_codepoint(codepoints, n) {
  var LIMIT = 0x110000; // unicode upper bound + 1

  var best = codepoints.reduce(function(lowest, candidate) {
    var qualifies = candidate >= n && candidate < lowest;
    return qualifies ? candidate : lowest;
  }, LIMIT);

  // sanity check - should not happen
  if (best >= LIMIT) {
    throw new Error('Next smallest code point not found.');
  }

  return best;
}
|
||||||
|
|
||||||
|
// Maps a digit value to the character code of its symbol:
// values below 26 become 'a'..'z' (97..122), the rest are shifted by 22 so
// that 26..35 become '0'..'9' (48..57).
function encode_digit(d) {
  if (d < 26) {
    return d + 97;
  }
  return d + 22;
}
|
||||||
|
|
||||||
|
// Maps the character code of a Punycode symbol back to its digit value.
// Letters are case-insensitive ('A'..'Z' and 'a'..'z' give 0..25) and
// '0'..'9' give 26..35. Any other input throws an Error.
function decode_digit(d) {
  var offset;

  if (d >= 97 && d <= 122) {
    offset = 97; // a..z
  } else if (d >= 65 && d <= 90) {
    offset = 65; // A..Z
  } else if (d >= 48 && d <= 57) {
    offset = 22; // 0..9
  } else {
    throw new Error('Illegal digit #' + d);
  }

  return d - offset;
}
|
||||||
|
|
||||||
|
// Computes the threshold for digit position `k` under the current `bias`:
// the difference k - bias, clamped to the range TMIN..TMAX.
function threshold(k, bias) {
  var at_or_below_floor = k <= bias + TMIN;
  var at_or_above_ceiling = k >= bias + TMAX;

  return at_or_below_floor ? TMIN :
         at_or_above_ceiling ? TMAX :
         k - bias;
}
|
||||||
|
|
||||||
|
// Encodes `delta` as a generalized variable-length integer.
//
// Returns an array of character codes (one per digit, least significant
// digit first). The threshold for each digit position is derived from
// `bias`; a digit below its threshold terminates the number.
function encode_int(bias, delta) {
  var digits = [];
  var q = delta;
  var k = BASE;

  while (true) {
    var t = threshold(k, bias);
    if (q < t) {
      break;
    }

    // Emit one non-terminal digit and carry the rest to the next position.
    var span = BASE - t;
    var excess = q - t;
    digits.push(encode_digit(t + (excess % span)));
    q = ~~(excess / span);
    k += BASE;
  }

  // The remaining value is below the threshold: the terminating digit.
  digits.push(encode_digit(q));

  return digits;
}
|
||||||
|
|
||||||
|
// Encodes a string as Punycode (RFC 3492, section 6.3).
//
//   input - the string to encode
//
// Returns the encoded ASCII string: the basic (< 128) code points of the
// input in their original order, a '-' delimiter if there were any, then
// the encoded insertion deltas for the remaining code points. The 'xn--'
// prefix used in host names is NOT added here.
//
// Throws an Error if `input` is not a string.
function encode(input) {
  if (typeof input != 'string') {
    throw new Error('Argument must be a string.');
  }

  // Convert the UTF-16 string to a list of Unicode code points. A valid
  // surrogate pair is merged into the single code point it stands for;
  // Punycode operates on code points, not on UTF-16 code units. Unpaired
  // surrogates are passed through unchanged.
  var codepoints = [];
  for (var pos = 0, n_units = input.length; pos < n_units; ++pos) {
    var cp = input.charCodeAt(pos);
    if (cp >= 0xD800 && cp <= 0xDBFF && pos + 1 < n_units) {
      var low = input.charCodeAt(pos + 1);
      if (low >= 0xDC00 && low <= 0xDFFF) {
        cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
        ++pos;
      }
    }
    codepoints.push(cp);
  }

  var len = codepoints.length;
  var output = [];
  var non_basic = [];
  var i, c;

  // Basic code points are copied to the output verbatim.
  for (i = 0; i < len; ++i) {
    c = codepoints[i];
    if (c < 128) {
      output.push(c);
    }
    else {
      non_basic.push(c);
    }
  }

  // b = number of basic code points, h = number of code points handled.
  var b = output.length;
  var h = b;

  if (b) {
    output.push(45); // delimiter '-'
  }

  var n = INITIAL_N;
  var bias = INITIAL_BIAS;
  var delta = 0;

  // Handle the non-basic code points in increasing order of value.
  for (; h < len; ++n, ++delta) {
    var m = next_smallest_codepoint(non_basic, n);
    delta += (m - n) * (h + 1);
    n = m;

    for (i = 0; i < len; ++i) {
      c = codepoints[i];
      if (c < n) {
        // JavaScript numbers never wrap around to 0, so there is no
        // wrap-around overflow to detect on this increment.
        ++delta;
      }
      else if (c == n) {
        // Append in place instead of re-allocating the output array.
        output.push.apply(output, encode_int(bias, delta));
        bias = adapt_bias(delta, h + 1, b == h);
        delta = 0;
        h++;
      }
    }
  }

  return String.fromCharCode.apply(String, output);
}
|
||||||
|
|
||||||
|
// Decodes a Punycode string (RFC 3492, section 6.2).
//
//   input - the Punycode string, without any 'xn--' prefix
//
// Returns the decoded string. Code points above U+FFFF are returned as
// UTF-16 surrogate pairs.
//
// Throws an Error if `input` is not a string, if a decoded code point is
// beyond U+10FFFF, and (via decode_digit) if the encoded part contains a
// character that is not a valid digit or ends prematurely.
function decode(input) {
  if (typeof input != 'string') {
    throw new Error('Argument must be a string.');
  }

  // find basic code points/delta separator
  // (b is the index of the first encoded digit, 0 if there is no '-')
  var b = 1 + input.lastIndexOf('-');

  var codes = [];
  for (var pos = 0; pos < input.length; ++pos) {
    codes.push(input.charCodeAt(pos));
  }

  // start with a copy of the basic code points
  var output = codes.slice(0, b ? (b - 1) : 0);

  var n = INITIAL_N;
  var bias = INITIAL_BIAS;

  for (var i = 0, len = codes.length; b < len; ++i) {
    var org_i = i;

    // Decode one generalized variable-length integer into i.
    for (var k = BASE, w = 1;; k += BASE) {
      var d = decode_digit(codes[b++]);

      // TODO overflow check
      i += d * w;

      var t = threshold(k, bias);
      if (d < t) {
        break;
      }

      // TODO overflow check
      w *= BASE - t;
    }

    var x = 1 + output.length;
    bias = adapt_bias(i - org_i, x, org_i == 0);

    // i wraps around the output length; every full wrap advances n.
    n += Math.floor(i / x);
    if (n > 0x10FFFF) {
      // Not representable in UTF-16; the input cannot be valid.
      throw new Error('Code point out of range.');
    }
    i %= x;

    output.splice(i, 0, n);
  }

  // Convert code points to UTF-16 code units. String.fromCharCode() keeps
  // only the low 16 bits of its arguments, so code points above U+FFFF
  // have to be split into a surrogate pair first.
  var units = [];
  for (var j = 0; j < output.length; ++j) {
    var cp = output[j];
    if (cp > 0xFFFF) {
      cp -= 0x10000;
      units.push(0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF));
    }
    else {
      units.push(cp);
    }
  }

  return String.fromCharCode.apply(String, units);
}
|
54
lib/url.js
54
lib/url.js
@ -19,6 +19,8 @@
|
|||||||
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
||||||
// USE OR OTHER DEALINGS IN THE SOFTWARE.
|
// USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
var punycode = require('punycode');
|
||||||
|
|
||||||
exports.parse = urlParse;
|
exports.parse = urlParse;
|
||||||
exports.resolve = urlResolve;
|
exports.resolve = urlResolve;
|
||||||
exports.resolveObject = urlResolveObject;
|
exports.resolveObject = urlResolveObject;
|
||||||
@ -183,24 +185,56 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
|
|||||||
var part = hostparts[i];
|
var part = hostparts[i];
|
||||||
if (!part) continue;
|
if (!part) continue;
|
||||||
if (!part.match(hostnamePartPattern)) {
|
if (!part.match(hostnamePartPattern)) {
|
||||||
var validParts = hostparts.slice(0, i);
|
var newpart = '';
|
||||||
var notHost = hostparts.slice(i + 1);
|
for (var j = 0, k = part.length; j < k; j++) {
|
||||||
var bit = part.match(hostnamePartStart);
|
if (part.charCodeAt(j) > 127) {
|
||||||
if (bit) {
|
// we replace non-ASCII char with a temporary placeholder
|
||||||
validParts.push(bit[1]);
|
// we need this to make sure size of hostname is not
|
||||||
notHost.unshift(bit[2]);
|
// broken by replacing non-ASCII by nothing
|
||||||
|
newpart += 'x';
|
||||||
|
} else {
|
||||||
|
newpart += part[j];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (notHost.length) {
|
// we test again with ASCII char only
|
||||||
rest = '/' + notHost.join('.') + rest
|
if (!newpart.match(hostnamePartPattern)) {
|
||||||
|
var validParts = hostparts.slice(0, i);
|
||||||
|
var notHost = hostparts.slice(i + 1);
|
||||||
|
var bit = part.match(hostnamePartStart);
|
||||||
|
if (bit) {
|
||||||
|
validParts.push(bit[1]);
|
||||||
|
notHost.unshift(bit[2]);
|
||||||
|
}
|
||||||
|
if (notHost.length) {
|
||||||
|
rest = '/' + notHost.join('.') + rest;
|
||||||
|
}
|
||||||
|
out.hostname = validParts.join('.');
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
out.hostname = validParts.join('.');
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// hostnames are always lower case.
|
// hostnames are always lower case.
|
||||||
out.hostname = out.hostname.toLowerCase();
|
out.hostname = out.hostname.toLowerCase();
|
||||||
|
|
||||||
|
// IDNA Support: Returns a puny coded representation of "domain".
|
||||||
|
// It only converts the part of the domain name that
|
||||||
|
// has non ASCII characters. I.e. it doesn't matter if
|
||||||
|
// you call it with a domain that already is in ASCII.
|
||||||
|
try {
|
||||||
|
var domainArray = out.hostname.split('.');
|
||||||
|
var newOut = [];
|
||||||
|
for (var i = 0; i < domainArray.length; ++i) {
|
||||||
|
var s = domainArray[i];
|
||||||
|
newOut.push(s.match(/[^A-Za-z0-9-]/) ?
|
||||||
|
'xn--' + punycode.encode(s) : s);
|
||||||
|
}
|
||||||
|
out.hostname = newOut.join('.');
|
||||||
|
} catch (e) {
|
||||||
|
// if encode fail for some reason, we just do the classic behavior.
|
||||||
|
}
|
||||||
|
|
||||||
out.host = ((out.auth) ? out.auth + '@' : '') +
|
out.host = ((out.auth) ? out.auth + '@' : '') +
|
||||||
(out.hostname || '') +
|
(out.hostname || '') +
|
||||||
((out.port) ? ':' + out.port : '');
|
((out.port) ? ':' + out.port : '');
|
||||||
|
38
test/simple/test-punycode.js
Normal file
38
test/simple/test-punycode.js
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
// Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
// of this software and associated documentation files (the "Software"), to deal
|
||||||
|
// in the Software without restriction, including without limitation the rights
|
||||||
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the Software is
|
||||||
|
// furnished to do so, subject to the following conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be included in
|
||||||
|
// all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
// THE SOFTWARE.
|
||||||
|
|
||||||
|
// Checks punycode.encode() and punycode.decode() against known vectors;
// every encode assertion has a matching decode assertion for the inverse.
//
// Both modules are declared with `var` so the test does not create
// implicit global variables.
var punycode = require('punycode');
var assert = require('assert');

// encode: a single non-ASCII char, ASCII only (delimiter is still
// appended), mixed content, a long sentence, and non-Latin text.
assert.equal(punycode.encode('ü'), 'tda');
assert.equal(punycode.encode('Goethe'), 'Goethe-');
assert.equal(punycode.encode('Bücher'), 'Bcher-kva');
assert.equal(punycode.encode(
    'Willst du die Blüthe des frühen, die Früchte des späteren Jahres'),
    'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal');
assert.equal(punycode.encode('日本語'), 'wgv71a119e');

// decode: the same vectors in the opposite direction.
assert.equal(punycode.decode('tda'), 'ü');
assert.equal(punycode.decode('Goethe-'), 'Goethe');
assert.equal(punycode.decode('Bcher-kva'), 'Bücher');
assert.equal(punycode.decode(
    'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal'),
    'Willst du die Blüthe des frühen, die Früchte des späteren Jahres');
assert.equal(punycode.decode('wgv71a119e'), '日本語');
|
@ -79,7 +79,7 @@ var parseTests = {
|
|||||||
'protocol': 'http:',
|
'protocol': 'http:',
|
||||||
'host': 'x.com',
|
'host': 'x.com',
|
||||||
'hostname': 'x.com',
|
'hostname': 'x.com',
|
||||||
'pathname': '/Y',
|
'pathname': '/Y'
|
||||||
},
|
},
|
||||||
// an unexpected invalid char in the hostname.
|
// an unexpected invalid char in the hostname.
|
||||||
'HtTp://x.y.cOm*a/b/c?d=e#f g<h>i' : {
|
'HtTp://x.y.cOm*a/b/c?d=e#f g<h>i' : {
|
||||||
@ -113,7 +113,7 @@ var parseTests = {
|
|||||||
},
|
},
|
||||||
'http://x/p/"quoted"': {
|
'http://x/p/"quoted"': {
|
||||||
'href': 'http://x/p/',
|
'href': 'http://x/p/',
|
||||||
'protocol':'http:',
|
'protocol': 'http:',
|
||||||
'host': 'x',
|
'host': 'x',
|
||||||
'hostname': 'x',
|
'hostname': 'x',
|
||||||
'pathname': '/p/'
|
'pathname': '/p/'
|
||||||
@ -274,6 +274,59 @@ var parseTests = {
|
|||||||
'search' : '?search=foo',
|
'search' : '?search=foo',
|
||||||
'query' : 'search=foo',
|
'query' : 'search=foo',
|
||||||
'hash' : '#bar'
|
'hash' : '#bar'
|
||||||
|
},
|
||||||
|
// IDNA tests
|
||||||
|
'http://www.日本語.com/' : {
|
||||||
|
'href': 'http://www.xn--wgv71a119e.com/',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'www.xn--wgv71a119e.com',
|
||||||
|
'hostname': 'www.xn--wgv71a119e.com',
|
||||||
|
'pathname': '/'
|
||||||
|
},
|
||||||
|
'http://example.Bücher.com/' : {
|
||||||
|
'href': 'http://example.xn--bcher-kva.com/',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'example.xn--bcher-kva.com',
|
||||||
|
'hostname': 'example.xn--bcher-kva.com',
|
||||||
|
'pathname': '/'
|
||||||
|
},
|
||||||
|
'http://www.Äffchen.com/' : {
|
||||||
|
'href': 'http://www.xn--ffchen-9ta.com/',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'www.xn--ffchen-9ta.com',
|
||||||
|
'hostname': 'www.xn--ffchen-9ta.com',
|
||||||
|
'pathname': '/'
|
||||||
|
},
|
||||||
|
'http://www.Äffchen.cOm*A/b/c?d=e#f g<h>i' : {
|
||||||
|
'href': 'http://www.xn--ffchen-9ta.com/*A/b/c?d=e#f',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'www.xn--ffchen-9ta.com',
|
||||||
|
'hostname': 'www.xn--ffchen-9ta.com',
|
||||||
|
'pathname': '/*A/b/c',
|
||||||
|
'search': '?d=e',
|
||||||
|
'query': 'd=e',
|
||||||
|
'hash': '#f'
|
||||||
|
},
|
||||||
|
'http://SÉLIER.COM/' : {
|
||||||
|
'href': 'http://xn--slier-bsa.com/',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'xn--slier-bsa.com',
|
||||||
|
'hostname': 'xn--slier-bsa.com',
|
||||||
|
'pathname': '/'
|
||||||
|
},
|
||||||
|
'http://ليهمابتكلموشعربي؟.ي؟/' : {
|
||||||
|
'href': 'http://xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f/',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f',
|
||||||
|
'hostname': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f',
|
||||||
|
'pathname': '/'
|
||||||
|
},
|
||||||
|
'http://➡.ws/➡' : {
|
||||||
|
'href': 'http://xn--hgi.ws/➡',
|
||||||
|
'protocol': 'http:',
|
||||||
|
'host': 'xn--hgi.ws',
|
||||||
|
'hostname': 'xn--hgi.ws',
|
||||||
|
'pathname': '/➡'
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
for (var u in parseTests) {
|
for (var u in parseTests) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user