Factor out get_reg_grapheme_cluster

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@62893 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
naruse 2018-03-22 07:58:39 +00:00
parent 41b2ef4685
commit 42f1b58964

View File

@@ -8355,20 +8355,12 @@ rb_str_codepoints(VALUE str)
return rb_str_enumerate_codepoints(str, ary); return rb_str_enumerate_codepoints(str, ary);
} }
static VALUE static regex_t *
rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) get_reg_grapheme_cluster(rb_encoding *enc)
{ {
long grapheme_cluster_count = 0; int encidx = rb_enc_to_index(enc);
regex_t *reg_grapheme_cluster = NULL; regex_t *reg_grapheme_cluster = NULL;
static regex_t *reg_grapheme_cluster_utf8 = NULL; static regex_t *reg_grapheme_cluster_utf8 = NULL;
int encidx = ENCODING_GET(str);
rb_encoding *enc = rb_enc_from_index(encidx);
int unicode_p = rb_enc_unicode_p(enc);
const char *ptr, *end;
if (!unicode_p || single_byte_optimizable(str)) {
return rb_str_length(str);
}
/* synchronize */ /* synchronize */
if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) { if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
@@ -8385,7 +8377,22 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
reg_grapheme_cluster_utf8 = reg_grapheme_cluster; reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
} }
} }
return reg_grapheme_cluster;
}
static VALUE
rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
{
size_t grapheme_cluster_count = 0;
regex_t *reg_grapheme_cluster = NULL;
rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
const char *ptr, *end;
if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
return rb_str_length(str);
}
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
ptr = RSTRING_PTR(str); ptr = RSTRING_PTR(str);
end = RSTRING_END(str); end = RSTRING_END(str);
@@ -8393,16 +8400,12 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
OnigPosition len = onig_match(reg_grapheme_cluster, OnigPosition len = onig_match(reg_grapheme_cluster,
(const OnigUChar *)ptr, (const OnigUChar *)end, (const OnigUChar *)ptr, (const OnigUChar *)end,
(const OnigUChar *)ptr, NULL, 0); (const OnigUChar *)ptr, NULL, 0);
if (len == 0) break; if (len <= 0) break;
if (len < 0) {
break;
}
grapheme_cluster_count++; grapheme_cluster_count++;
ptr += len; ptr += len;
} }
RB_GC_GUARD(str);
return LONG2NUM(grapheme_cluster_count); return SIZET2NUM(grapheme_cluster_count);
} }
static VALUE static VALUE
@@ -8410,33 +8413,15 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
{ {
VALUE orig = str; VALUE orig = str;
regex_t *reg_grapheme_cluster = NULL; regex_t *reg_grapheme_cluster = NULL;
static regex_t *reg_grapheme_cluster_utf8 = NULL; rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
int encidx = ENCODING_GET(str);
rb_encoding *enc = rb_enc_from_index(encidx);
int unicode_p = rb_enc_unicode_p(enc);
const char *ptr, *end; const char *ptr, *end;
if (!unicode_p || single_byte_optimizable(str)) { if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
return rb_str_enumerate_chars(str, ary); return rb_str_enumerate_chars(str, ary);
} }
/* synchronize */
if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
reg_grapheme_cluster = reg_grapheme_cluster_utf8;
}
if (!reg_grapheme_cluster) {
const OnigUChar source[] = "\\X";
int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1,
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL);
if (r) {
rb_bug("cannot compile grapheme cluster regexp");
}
if (encidx == rb_utf8_encindex()) {
reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
}
}
if (!ary) str = rb_str_new_frozen(str); if (!ary) str = rb_str_new_frozen(str);
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
ptr = RSTRING_PTR(str); ptr = RSTRING_PTR(str);
end = RSTRING_END(str); end = RSTRING_END(str);
@@ -8444,10 +8429,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
OnigPosition len = onig_match(reg_grapheme_cluster, OnigPosition len = onig_match(reg_grapheme_cluster,
(const OnigUChar *)ptr, (const OnigUChar *)end, (const OnigUChar *)ptr, (const OnigUChar *)end,
(const OnigUChar *)ptr, NULL, 0); (const OnigUChar *)ptr, NULL, 0);
if (len == 0) break; if (len <= 0) break;
if (len < 0) {
break;
}
ENUM_ELEM(ary, rb_enc_str_new(ptr, len, enc)); ENUM_ELEM(ary, rb_enc_str_new(ptr, len, enc));
ptr += len; ptr += len;
} }