string.c: improve splitting into chars

* string.c (rb_str_split_m): improve splitting into chars by an
  empty string, without a regexp.

    Comparison:
                           to_chars-1
              built-ruby:   1273527.6 i/s
            compare-ruby:    189423.3 i/s - 6.72x  slower

                          to_chars-10
              built-ruby:    120993.5 i/s
            compare-ruby:     37075.8 i/s - 3.26x  slower

                         to_chars-100
              built-ruby:     15646.4 i/s
            compare-ruby:      4012.1 i/s - 3.90x  slower

                        to_chars-1000
              built-ruby:      1295.1 i/s
            compare-ruby:       408.5 i/s - 3.17x  slower

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@67582 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
nobu 2019-04-17 05:34:46 +00:00
parent 62c07674e0
commit e1eb54b99d
2 changed files with 27 additions and 10 deletions

View File

@ -0,0 +1,7 @@
prelude: |
  str0 = [*0..9].join("")
benchmark:
  to_chars-1: str0.split('')
  to_chars-10: (str0 * 10).split('')
  to_chars-100: (str0 * 100).split('')
  to_chars-1000: (str0 * 1000).split('')

View File

@ -7759,7 +7759,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
rb_encoding *enc; rb_encoding *enc;
VALUE spat; VALUE spat;
VALUE limit; VALUE limit;
enum {awk, string, regexp} split_type; enum {awk, string, regexp, chars} split_type;
long beg, end, i = 0, empty_count = -1; long beg, end, i = 0, empty_count = -1;
int lim = 0; int lim = 0;
VALUE result, tmp; VALUE result, tmp;
@ -7801,8 +7801,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
split_type = string; split_type = string;
if (RSTRING_LEN(spat) == 0) { if (RSTRING_LEN(spat) == 0) {
/* Special case - split into chars */ /* Special case - split into chars */
spat = rb_reg_regcomp(spat); split_type = chars;
split_type = regexp;
} }
else if (rb_enc_asciicompat(enc2) == 1) { else if (rb_enc_asciicompat(enc2) == 1) {
if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') { if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
@ -7823,9 +7822,9 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
if (result) result = rb_ary_new(); if (result) result = rb_ary_new();
beg = 0; beg = 0;
char *ptr = RSTRING_PTR(str);
char *eptr = RSTRING_END(str);
if (split_type == awk) { if (split_type == awk) {
char *ptr = RSTRING_PTR(str);
char *eptr = RSTRING_END(str);
char *bptr = ptr; char *bptr = ptr;
int skip = 1; int skip = 1;
unsigned int c; unsigned int c;
@ -7884,10 +7883,8 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
} }
} }
else if (split_type == string) { else if (split_type == string) {
char *ptr = RSTRING_PTR(str);
char *str_start = ptr; char *str_start = ptr;
char *substr_start = ptr; char *substr_start = ptr;
char *eptr = RSTRING_END(str);
char *sptr = RSTRING_PTR(spat); char *sptr = RSTRING_PTR(spat);
long slen = RSTRING_LEN(spat); long slen = RSTRING_LEN(spat);
@ -7908,8 +7905,21 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
} }
beg = ptr - str_start; beg = ptr - str_start;
} }
else if (split_type == chars) {
char *str_start = ptr;
int n;
mustnot_broken(str);
enc = rb_enc_get(str);
while (ptr < eptr &&
(n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
SPLIT_STR(ptr - str_start, n);
ptr += n;
if (!NIL_P(limit) && lim <= ++i) break;
}
beg = ptr - str_start;
}
else { else {
char *ptr = RSTRING_PTR(str);
long len = RSTRING_LEN(str); long len = RSTRING_LEN(str);
long start = beg; long start = beg;
long idx; long idx;
@ -7924,14 +7934,14 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
break; break;
} }
else if (last_null == 1) { else if (last_null == 1) {
SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc)); SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
beg = start; beg = start;
} }
else { else {
if (start == len) if (start == len)
start++; start++;
else else
start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc); start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
last_null = 1; last_null = 1;
continue; continue;
} }