* encoding.c (rb_enc_get_ascii): add an argument to provide the
  length of the returned character.

* include/ruby/encoding.h (rb_enc_get_ascii): add the argument.

* re.c (rb_reg_expr_str): modify rb_enc_get_ascii call.
  (rb_reg_quote): ditto.
  (rb_reg_regsub): ditto.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14190 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2007-12-11 03:08:50 +00:00
parent 9ee1ab0e28
commit 5802768b40
4 changed files with 90 additions and 69 deletions

View File

@ -1,3 +1,14 @@
Tue Dec 11 12:05:51 2007 Tanaka Akira <akr@fsij.org>
* encoding.c (rb_enc_get_ascii): add an argument to provide the
length of the returned character.
* include/ruby/encoding.h (rb_enc_get_ascii): add the argument.
* re.c (rb_reg_expr_str): modify rb_enc_get_ascii call.
(rb_reg_quote): ditto.
(rb_reg_regsub): ditto.
Tue Dec 11 09:40:21 2007 Tanaka Akira <akr@fsij.org> Tue Dec 11 09:40:21 2007 Tanaka Akira <akr@fsij.org>
* include/ruby/oniguruma.h (ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE): * include/ruby/oniguruma.h (ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE):

View File

@ -505,22 +505,26 @@ rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
return n; return n;
} }
int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc) int rb_enc_get_ascii(const char *p, const char *e, int *len, rb_encoding *enc)
{ {
int c, l; int c, l;
if (e <= p) if (e <= p)
return -1; return -1;
if (rb_enc_asciicompat(enc)) { if (rb_enc_asciicompat(enc)) {
c = (unsigned char)*p; c = (unsigned char)*p;
return ISASCII(c) ? c : -1; if (!ISASCII(c))
return -1;
if (len) *len = 1;
return c;
} }
l = rb_enc_precise_mbclen(p, e, enc); l = rb_enc_precise_mbclen(p, e, enc);
if (!MBCLEN_CHARFOUND(l)) if (!MBCLEN_CHARFOUND(l))
return -1; return -1;
c = rb_enc_codepoint(p, e, enc); c = rb_enc_codepoint(p, e, enc);
if (rb_enc_isascii(c, enc)) if (!rb_enc_isascii(c, enc))
return c;
return -1; return -1;
if (len) *len = l;
return c;
} }
int int

View File

@ -72,14 +72,14 @@ rb_encoding * rb_enc_find(const char *name);
/* ptr,endptr,encoding -> mbclen */ /* ptr,endptr,encoding -> mbclen */
int rb_enc_mbclen(const char*, const char *, rb_encoding*); int rb_enc_mbclen(const char*, const char *, rb_encoding*);
/* ptr,endptr,encoding -> chlen, invalid or needmore */ /* -> chlen, invalid or needmore */
int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*); int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc);
#define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret) #define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret)
#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret) #define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret) #define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
/* ptr,endptr,encoding -> 0x00..0x7f, -1 */ /* -> 0x00..0x7f, -1 */
int rb_enc_get_ascii(const char*, const char *, rb_encoding*); int rb_enc_get_ascii(const char *p, const char *e, int *len, rb_encoding *enc);
/* code,encoding -> codelen */ /* code,encoding -> codelen */
int rb_enc_codelen(int, rb_encoding*); int rb_enc_codelen(int, rb_encoding*);

96
re.c
View File

@ -218,16 +218,21 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
rb_encoding *enc = rb_enc_get(str); rb_encoding *enc = rb_enc_get(str);
const char *p, *pend; const char *p, *pend;
int need_escape = 0; int need_escape = 0;
int c; int c, clen;
p = s; pend = p + len; p = s; pend = p + len;
while (p<pend) { while (p<pend) {
c = rb_enc_get_ascii(p, pend, enc); c = rb_enc_get_ascii(p, pend, &clen, enc);
if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) { if (c == -1) {
p += mbclen(p, pend, enc);
}
else if (c != '/' && rb_enc_isprint(c, enc)) {
p += clen;
}
else {
need_escape = 1; need_escape = 1;
break; break;
} }
p += mbclen(p, pend, enc);
} }
if (!need_escape) { if (!need_escape) {
rb_str_buf_cat(str, s, len); rb_str_buf_cat(str, s, len);
@ -235,9 +240,9 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
else { else {
p = s; p = s;
while (p<pend) { while (p<pend) {
c = rb_enc_get_ascii(p, pend, enc); c = rb_enc_get_ascii(p, pend, &clen, enc);
if (c == '\\') { if (c == '\\' && p+clen < pend) {
int n = mbclen(p+1, pend, enc) + 1; int n = clen + mbclen(p+clen, pend, enc);
rb_str_buf_cat(str, p, n); rb_str_buf_cat(str, p, n);
p += n; p += n;
continue; continue;
@ -245,7 +250,7 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
else if (c == '/') { else if (c == '/') {
char c = '\\'; char c = '\\';
rb_str_buf_cat(str, &c, 1); rb_str_buf_cat(str, &c, 1);
rb_str_buf_cat(str, p, 1); rb_str_buf_cat(str, p, clen);
} }
else if (c == -1) { else if (c == -1) {
int l = mbclen(p, pend, enc); int l = mbclen(p, pend, enc);
@ -254,7 +259,7 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
continue; continue;
} }
else if (rb_enc_isprint(c, enc)) { else if (rb_enc_isprint(c, enc)) {
rb_str_buf_cat(str, p, 1); rb_str_buf_cat(str, p, clen);
} }
else if (!rb_enc_isspace(c, enc)) { else if (!rb_enc_isspace(c, enc)) {
char b[8]; char b[8];
@ -263,9 +268,9 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
rb_str_buf_cat(str, b, 4); rb_str_buf_cat(str, b, 4);
} }
else { else {
rb_str_buf_cat(str, p, 1); rb_str_buf_cat(str, p, clen);
} }
p++; p += clen;
} }
} }
} }
@ -2376,19 +2381,15 @@ rb_reg_quote(VALUE str)
rb_encoding *enc = rb_enc_get(str); rb_encoding *enc = rb_enc_get(str);
char *s, *send, *t; char *s, *send, *t;
VALUE tmp; VALUE tmp;
int c; int c, clen;
int ascii_only = rb_enc_str_asciionly_p(str); int ascii_only = rb_enc_str_asciionly_p(str);
s = RSTRING_PTR(str); s = RSTRING_PTR(str);
send = s + RSTRING_LEN(str); send = s + RSTRING_LEN(str);
for (; s < send; s++) { while (s < send) {
c = rb_enc_get_ascii(s, send, enc); c = rb_enc_get_ascii(s, send, &clen, enc);
if (c == -1) { if (c == -1) {
int n = mbclen(s, send, enc); s += mbclen(s, send, enc);
while (n-- && s < send)
s++;
s--;
continue; continue;
} }
switch (c) { switch (c) {
@ -2400,6 +2401,7 @@ rb_reg_quote(VALUE str)
case '\t': case '\f': case '\v': case '\n': case '\r': case '\t': case '\f': case '\v': case '\n': case '\r':
goto meta_found; goto meta_found;
} }
s += clen;
} }
if (ascii_only && rb_enc_get_index(str) != 0) { if (ascii_only && rb_enc_get_index(str) != 0) {
str = rb_str_new3(str); str = rb_str_new3(str);
@ -2417,16 +2419,16 @@ rb_reg_quote(VALUE str)
memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
t += s - RSTRING_PTR(str); t += s - RSTRING_PTR(str);
for (; s < send; s++) { while (s < send) {
c = rb_enc_get_ascii(s, send, enc); c = rb_enc_get_ascii(s, send, &clen, enc);
if (c == -1) { if (c == -1) {
int n = mbclen(s, send, enc); int n = mbclen(s, send, enc);
while (n-- && s < send) while (n--)
*t++ = *s++; *t++ = *s++;
s--;
continue; continue;
} }
s += clen;
switch (c) { switch (c) {
case '[': case ']': case '{': case '}': case '[': case ']': case '{': case '}':
case '(': case ')': case '|': case '-': case '(': case ')': case '|': case '-':
@ -2684,8 +2686,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
{ {
VALUE val = 0; VALUE val = 0;
char *p, *s, *e; char *p, *s, *e;
unsigned char uc; int no, clen;
int no;
rb_encoding *enc = rb_enc_check(str, src); rb_encoding *enc = rb_enc_check(str, src);
rb_enc_check(str, regexp); rb_enc_check(str, regexp);
@ -2693,30 +2694,37 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
e = s + RSTRING_LEN(str); e = s + RSTRING_LEN(str);
while (s < e) { while (s < e) {
int c = rb_enc_get_ascii(s, e, enc); int c = rb_enc_get_ascii(s, e, &clen, enc);
char *ss = s++; char *ss;
if (c == -1) { if (c == -1) {
s += mbclen(ss, e, enc) - 1; s += mbclen(s, e, enc);
continue; continue;
} }
ss = s;
s += clen;
if (c != '\\' || s == e) continue; if (c != '\\' || s == e) continue;
if (!val) { if (!val) {
val = rb_str_buf_new(ss-p); val = rb_str_buf_new(ss-p);
rb_str_buf_cat(val, p, ss-p);
} }
else {
rb_str_buf_cat(val, p, ss-p); rb_str_buf_cat(val, p, ss-p);
}
uc = (unsigned char)*s++; c = rb_enc_get_ascii(s, e, &clen, enc);
if (c == -1) {
s += mbclen(s, e, enc);
rb_str_buf_cat(val, ss, s-ss);
continue;
}
s += clen;
p = s; p = s;
switch (uc) { switch (c) {
case '1': case '2': case '3': case '4': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': case '5': case '6': case '7': case '8': case '9':
if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) { if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
no = uc - '0'; no = c - '0';
} }
else { else {
continue; continue;
@ -2724,17 +2732,18 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
break; break;
case 'k': case 'k':
if (s < e && *s == '<') { if (s < e && rb_enc_get_ascii(s, e, &clen, enc) == '<') {
char *name, *name_end; char *name, *name_end;
name_end = name = s + 1; name_end = name = s + clen;
while (name_end < e) { while (name_end < e) {
if (*name_end == '>') break; c = rb_enc_get_ascii(name_end, e, &clen, enc);
name_end += mbclen(name_end, e, enc); if (c == '>') break;
name_end += c == -1 ? mbclen(name_end, e, enc) : clen;
} }
if (name_end < e) { if (name_end < e) {
no = name_to_backref_number(regs, regexp, name, name_end); no = name_to_backref_number(regs, regexp, name, name_end);
p = s = name_end + 1; p = s = name_end + clen;
break; break;
} }
else { else {
@ -2742,7 +2751,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
} }
} }
rb_str_buf_cat(val, s-2, 2); rb_str_buf_cat(val, ss, s-ss);
continue; continue;
case '0': case '0':
@ -2765,11 +2774,11 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
break; break;
case '\\': case '\\':
rb_str_buf_cat(val, s-1, 1); rb_str_buf_cat(val, s-clen, clen);
continue; continue;
default: default:
rb_str_buf_cat(val, s-2, 2); rb_str_buf_cat(val, ss, s-ss);
continue; continue;
} }
@ -2783,12 +2792,9 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
if (p < e) { if (p < e) {
if (!val) { if (!val) {
val = rb_str_buf_new(e-p); val = rb_str_buf_new(e-p);
rb_str_buf_cat(val, p, e-p);
} }
else {
rb_str_buf_cat(val, p, e-p); rb_str_buf_cat(val, p, e-p);
} }
}
if (!val) return str; if (!val) return str;
return val; return val;