* string.c (rb_str_succ): don't increment/decrement codepoint.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15268 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2008-01-27 08:21:24 +00:00
parent 4a9d407e35
commit b1e6c052cd
3 changed files with 170 additions and 92 deletions

View File

@@ -1,3 +1,7 @@
Sun Jan 27 17:20:10 2008 Tanaka Akira <akr@fsij.org>
* string.c (rb_str_succ): don't increment/decrement codepoint.
Sun Jan 27 16:03:42 2008 NARUSE, Yui <naruse@ruby-lang.org> Sun Jan 27 16:03:42 2008 NARUSE, Yui <naruse@ruby-lang.org>
* lib/irb/ruby-lex.rb (RubyLex#buf_input): use chars.to_a. * lib/irb/ruby-lex.rb (RubyLex#buf_input): use chars.to_a.

233
string.c
View File

@@ -2000,74 +2000,143 @@ rb_str_match_m(int argc, VALUE *argv, VALUE str)
return result; return result;
} }
static int enum neighbor_char {
succ_char(char *s) NEIGHBOR_NOT_CHAR,
{ NEIGHBOR_FOUND,
char c = *s; NEIGHBOR_WRAPPED
};
/* numerics */ static enum neighbor_char
if ('0' <= c && c < '9') (*s)++; enc_succ_char(char *p, int len, rb_encoding *enc)
else if (c == '9') { {
*s = '0'; int i, l;
return '1'; while (1) {
for (i = len-1; 0 <= i; i--) {
int c;
c = ++((unsigned char*)p)[i];
if (c != 0)
break;
} }
/* small alphabets */ if (i < 0)
else if ('a' <= c && c < 'z') (*s)++; return NEIGHBOR_WRAPPED;
else if (c == 'z') { l = rb_enc_precise_mbclen(p, p+len, enc);
return *s = 'a'; if (MBCLEN_CHARFOUND(l)) {
if (l == len) {
return NEIGHBOR_FOUND;
}
else {
memset(p+l, '\xff', len-l);
}
}
if (MBCLEN_INVALID(l) && i < len-1) {
int len2, l2;
for (len2 = len-1; 0 < len2; len2--) {
l2 = rb_enc_precise_mbclen(p, p+len2, enc);
if (!MBCLEN_INVALID(l2))
break;
}
memset(p+len2+1, '\xff', len-(len2+1));
}
}
}
static enum neighbor_char
enc_pred_char(char *p, int len, rb_encoding *enc)
{
int i, l;
while (1) {
for (i = len-1; 0 <= i; i--) {
int c;
c = --((unsigned char*)p)[i];
if (c != 0xff)
break;
}
if (i < 0)
return NEIGHBOR_WRAPPED;
l = rb_enc_precise_mbclen(p, p+len, enc);
if (MBCLEN_CHARFOUND(l)) {
if (l == len) {
return NEIGHBOR_FOUND;
}
else {
memset(p+l, '\0', len-l);
}
}
if (MBCLEN_INVALID(l) && i < len-1) {
int len2, l2;
for (len2 = len-1; 0 < len2; len2--) {
l2 = rb_enc_precise_mbclen(p, p+len2, enc);
if (!MBCLEN_INVALID(l2))
break;
}
memset(p+len2+1, '\0', len-(len2+1));
} }
/* capital alphabets */
else if ('A' <= c && c < 'Z') (*s)++;
else if (c == 'Z') {
return *s = 'A';
} }
return 0;
} }
/* /*
overwrite +s+ by succeeding letter of +c+ in +enc+ and returns overwrite +p+ by succeeding letter in +enc+ and returns
carried-out letter. assuming each ranges are successive, and mbclen NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
assuming each ranges are successive, and mbclen
never change in each ranges. never change in each ranges.
NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
character.
*/ */
static int static enum neighbor_char
enc_succ_char(unsigned int c, char *s, rb_encoding *enc) enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
{ {
unsigned int cs; enum neighbor_char ret;
int c;
int ctype;
int range;
char save[ONIGENC_CODE_TO_MBC_MAXLEN];
/* numerics */ c = rb_enc_mbc_to_codepoint(p, p+len, enc);
if (rb_enc_isdigit(c, enc)) { if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
cs = c++; ctype = ONIGENC_CTYPE_DIGIT;
if (rb_enc_isdigit(c, enc)) { else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
rb_enc_mbcput(c, s, enc); ctype = ONIGENC_CTYPE_ALPHA;
return 0; else
return NEIGHBOR_NOT_CHAR;
MEMCPY(save, p, char, len);
ret = enc_succ_char(p, len, enc);
if (ret == NEIGHBOR_FOUND) {
c = rb_enc_mbc_to_codepoint(p, p+len, enc);
if (rb_enc_isctype(c, ctype, enc))
return NEIGHBOR_FOUND;
} }
do c = cs--; while (rb_enc_isdigit(cs, enc)); MEMCPY(p, save, char, len);
rb_enc_mbcput(c, s, enc); range = 1;
return ++c; while (1) {
MEMCPY(save, p, char, len);
ret = enc_pred_char(p, len, enc);
if (ret == NEIGHBOR_FOUND) {
c = rb_enc_mbc_to_codepoint(p, p+len, enc);
if (!rb_enc_isctype(c, ctype, enc)) {
MEMCPY(p, save, char, len);
break;
} }
/* small alphabets */
if (rb_enc_islower(c, enc)) {
cs = c++;
if (rb_enc_islower(c, enc)) {
rb_enc_mbcput(c, s, enc);
return 0;
} }
do c = cs--; while (rb_enc_islower(cs, enc)); else {
rb_enc_mbcput(c, s, enc); MEMCPY(p, save, char, len);
return c; break;
} }
/* capital alphabets */ range++;
if (rb_enc_isupper(c, enc)) {
cs = c++;
if (rb_enc_isupper(c, enc)) {
rb_enc_mbcput(c, s, enc);
return 0;
} }
do c = cs--; while (rb_enc_isupper(cs, enc)); if (range == 1) {
rb_enc_mbcput(c, s, enc); return NEIGHBOR_NOT_CHAR;
return c;
} }
return -1;
if (ctype != ONIGENC_CTYPE_DIGIT) {
MEMCPY(carry, p, char, len);
return NEIGHBOR_WRAPPED;
}
MEMCPY(carry, p, char, len);
enc_succ_char(carry, len, enc);
return NEIGHBOR_WRAPPED;
} }
@@ -2103,9 +2172,9 @@ rb_str_succ(VALUE orig)
VALUE str; VALUE str;
char *sbeg, *s, *e; char *sbeg, *s, *e;
int c = -1; int c = -1;
unsigned int cc = 0; long l;
long n = 0, o = 0, l;
char carry[ONIGENC_CODE_TO_MBC_MAXLEN]; char carry[ONIGENC_CODE_TO_MBC_MAXLEN];
int carry_pos, carry_len;
str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
rb_enc_copy(str, orig); rb_enc_copy(str, orig);
@@ -2117,41 +2186,45 @@ rb_str_succ(VALUE orig)
s = e = sbeg + RSTRING_LEN(str); s = e = sbeg + RSTRING_LEN(str);
while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) { while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
enum neighbor_char neighbor;
if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
cc = rb_enc_mbc_to_codepoint(s, e, enc); neighbor = enc_succ_alnum_char(s, l, enc, carry);
if (rb_enc_isalnum(cc, enc)) { if (neighbor == NEIGHBOR_NOT_CHAR)
if (rb_enc_isascii(cc, enc)) { continue;
if ((c = succ_char(s)) == 0) break; if (neighbor == NEIGHBOR_FOUND)
} return str;
else { c = 1;
if ((c = enc_succ_char(cc, s, enc)) == 0) break; carry_pos = s - sbeg;
} carry_len = l;
n = s - sbeg;
}
} }
if (c == -1) { /* str contains no alnum */ if (c == -1) { /* str contains no alnum */
c = '\001'; carry[0] = '\001';
carry_len = 1;
s = e; s = e;
while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) { while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
int limit = 256; enum neighbor_char neighbor;
if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
cc = rb_enc_mbc_to_codepoint(s, e, enc); neighbor = enc_succ_char(s, l, enc);
while ((l = rb_enc_mbcput(++cc, carry, enc)) < 0 && --limit); if (neighbor == NEIGHBOR_FOUND)
if (l > 0) { return str;
if (l == (o = e - s)) goto overlay; if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
n = s - sbeg; /* wrapped to \0...\0. search next valid char. */
goto insert; enc_succ_char(s, l, enc);
}
c = 1;
carry_pos = s - sbeg;
}
if (c == -1) {
c = 1;
carry_pos = 0;
} }
} }
} if (!s && c == 1) {
if (!s && (l = rb_enc_mbcput(c, carry, enc)) > 0) { RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
insert: s = RSTRING_PTR(str) + carry_pos;
RESIZE_CAPA(str, RSTRING_LEN(str) + l - o); memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
s = RSTRING_PTR(str) + n; memmove(s, carry, carry_len);
memmove(s + l, s + o, RSTRING_LEN(str) - n - o); STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
overlay:
memmove(s, carry, l);
STR_SET_LEN(str, RSTRING_LEN(str) + l - o);
RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
} }

View File

@@ -1349,10 +1349,6 @@ class TestM17NComb < Test::Unit::TestCase
end end
def test_str_succ def test_str_succ
starts = [
e("\xA1\xA1"),
e("\xFE\xFE")
]
STRINGS.each {|s0| STRINGS.each {|s0|
next if s0.empty? next if s0.empty?
s = s0.dup s = s0.dup
@@ -1360,11 +1356,16 @@ class TestM17NComb < Test::Unit::TestCase
h = {} h = {}
n.times {|i| n.times {|i|
if h[s] if h[s]
assert(false, "#{encdump s} cycle with succ! #{i-h[s]} times") assert(false, "#{encdump s} cycle with succ #{i-h[s]} times")
end end
h[s] = i h[s] = i
assert_operator(s.length, :<=, s0.length + Math.log2(i+1) + 1, "#{encdump s0} succ! #{i} times => #{encdump s}") assert_operator(s.length, :<=, s0.length + Math.log2(i+1) + 1, "#{encdump s0} succ #{i} times => #{encdump s}")
s.succ! #puts encdump(s)
t = s.succ
if s.valid_encoding?
assert(t.valid_encoding?, "#{encdump s}.succ.valid_encoding?")
end
s = t
} }
} }
end end