* enc/unicode.c: Removed artificial expansion for Turkic,
  added hand-coded support for Turkic, fixed logic for swapcase.
* string.c: Made use of new case mapping code possible from upcase,
  capitalize, and swapcase (with :lithuanian as a guard).
* test/ruby/enc/test_case_mapping.rb: Adjusted for above.
  (with Kimihito Matsui)

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53562 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
0bc5341690
commit
959bbb6f72
@ -1,3 +1,12 @@
|
|||||||
|
Sun Jan 17 17:41:41 2016 Martin Duerst <duerst@it.aoyama.ac.jp>
|
||||||
|
|
||||||
|
* enc/unicode.c: Removed artificial expansion for Turkic,
|
||||||
|
added hand-coded support for Turkic, fixed logic for swapcase.
|
||||||
|
* string.c: Made use of new case mapping code possible from upcase,
|
||||||
|
capitalize, and swapcase (with :lithuanian as a guard).
|
||||||
|
* test/ruby/enc/test_case_mapping.rb: Adjusted for above.
|
||||||
|
(with Kimihito Matsui)
|
||||||
|
|
||||||
Sun Jan 17 15:30:57 2016 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
Sun Jan 17 15:30:57 2016 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||||
|
|
||||||
* ext/socket/option.c (sockopt_bool): relax boolean size to be one
|
* ext/socket/option.c (sockopt_bool): relax boolean size to be one
|
||||||
|
@ -606,9 +606,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
|
|||||||
|
|
||||||
/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
|
/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
|
||||||
#define CASE_MAPPING_SLACK 12
|
#define CASE_MAPPING_SLACK 12
|
||||||
/* The following declaration should be moved to an include file rather than
|
#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
|
||||||
be duplicated here (and in string.c), but we'll wait for this because we
|
|
||||||
want this to become a primitive anyway. */
|
|
||||||
extern int
|
extern int
|
||||||
onigenc_unicode_case_map(OnigCaseFoldType* flagP,
|
onigenc_unicode_case_map(OnigCaseFoldType* flagP,
|
||||||
const OnigUChar** pp, const OnigUChar* end,
|
const OnigUChar** pp, const OnigUChar* end,
|
||||||
@ -620,29 +618,52 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP,
|
|||||||
OnigCaseFoldType flags = *flagP;
|
OnigCaseFoldType flags = *flagP;
|
||||||
to_end -= CASE_MAPPING_SLACK;
|
to_end -= CASE_MAPPING_SLACK;
|
||||||
|
|
||||||
/* hopelessly preliminary implementation, just dealing with ASCII,
|
/* hopelessly preliminary implementation, just dealing with ASCII and Turkic */
|
||||||
* and just for downcase */
|
|
||||||
while (*pp<end && to<=to_end) {
|
while (*pp<end && to<=to_end) {
|
||||||
code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
|
code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
|
||||||
*pp += enclen(enc, *pp, end);
|
*pp += enclen(enc, *pp, end);
|
||||||
/* using :turcic to test buffer expansion */
|
if (code<='z') { /* ASCII comes first */
|
||||||
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) { /* I */
|
if (code>='a' && code<='z') {
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'T', to);
|
if (flags&ONIGENC_CASE_UPCASE) {
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'U', to);
|
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0069) /* i → İ */
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'R', to);
|
code = 0x0130;
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'K', to);
|
else
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'I', to);
|
code += 'A'-'a';
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'S', to);
|
MODIFIED;
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, 'H', to);
|
}
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, '*', to);
|
}
|
||||||
code = 0x0131;
|
else if (code>='A' && code<='Z') {
|
||||||
flags |= ONIGENC_CASE_MODIFIED;
|
if (flags&ONIGENC_CASE_DOWNCASE) {
|
||||||
|
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) /* I → ı */
|
||||||
|
code = 0x0131;
|
||||||
|
else
|
||||||
|
code += 'a'-'A';
|
||||||
|
MODIFIED;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (code>='A' && code<='Z') {
|
else if (code>=0x00C0) { /* deal with non-ASCII; nothing relevant below U+00C0 */
|
||||||
code += 'a'-'A';
|
if (code==0x0130) { /* İ → i */
|
||||||
flags |= ONIGENC_CASE_MODIFIED;
|
if (flags&ONIGENC_CASE_UPCASE) {
|
||||||
|
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)
|
||||||
|
code = 0x0069;
|
||||||
|
else { /* make dot above explicit */
|
||||||
|
to += ONIGENC_CODE_TO_MBC(enc, 0x0069, to);
|
||||||
|
code = 0x0307; /* dot above */
|
||||||
|
}
|
||||||
|
MODIFIED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* the following case can be removed once we rely on data,
|
||||||
|
* because the mapping is always the same */
|
||||||
|
else if (code==0x0131 && flags&ONIGENC_CASE_UPCASE) { /* ı → I */
|
||||||
|
code = 0x0049; MODIFIED;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
to += ONIGENC_CODE_TO_MBC(enc, code, to);
|
to += ONIGENC_CODE_TO_MBC(enc, code, to);
|
||||||
|
/* switch from titlecase to lowercase for capitalize */
|
||||||
|
if (flags & ONIGENC_CASE_TITLECASE)
|
||||||
|
flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_TITLECASE|ONIGENC_CASE_DOWNCASE);
|
||||||
}
|
}
|
||||||
*flagP = flags;
|
*flagP = flags;
|
||||||
return (int)(to-to_start);
|
return (int)(to-to_start);
|
||||||
|
42
string.c
42
string.c
@ -5734,7 +5734,11 @@ rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
|
|||||||
enc = STR_ENC_GET(str);
|
enc = STR_ENC_GET(str);
|
||||||
rb_str_check_dummy_enc(enc);
|
rb_str_check_dummy_enc(enc);
|
||||||
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
||||||
if (single_byte_optimizable(str)) {
|
if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
|
||||||
|
str_shared_replace(str, rb_str_casemap(str, &flags, enc));
|
||||||
|
modify = ONIGENC_CASE_MODIFIED & flags;
|
||||||
|
}
|
||||||
|
else if (single_byte_optimizable(str)) {
|
||||||
while (s < send) {
|
while (s < send) {
|
||||||
unsigned int c = *(unsigned char*)s;
|
unsigned int c = *(unsigned char*)s;
|
||||||
|
|
||||||
@ -5817,7 +5821,7 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
|
|||||||
enc = STR_ENC_GET(str);
|
enc = STR_ENC_GET(str);
|
||||||
rb_str_check_dummy_enc(enc);
|
rb_str_check_dummy_enc(enc);
|
||||||
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
||||||
if (/*enc==rb_utf8_encoding() &&*/ flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
|
if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
|
||||||
str_shared_replace(str, rb_str_casemap(str, &flags, enc));
|
str_shared_replace(str, rb_str_casemap(str, &flags, enc));
|
||||||
modify = ONIGENC_CASE_MODIFIED & flags;
|
modify = ONIGENC_CASE_MODIFIED & flags;
|
||||||
}
|
}
|
||||||
@ -5906,29 +5910,33 @@ rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
|
|||||||
int modify = 0;
|
int modify = 0;
|
||||||
unsigned int c;
|
unsigned int c;
|
||||||
int n;
|
int n;
|
||||||
OnigCaseFoldType flags = ONIGENC_CASE_UPCASE |
|
OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
|
||||||
ONIGENC_CASE_TITLECASE | ONIGENC_CASE_ONCEONLY;
|
|
||||||
|
|
||||||
flags = check_case_options(argc, argv, flags);
|
flags = check_case_options(argc, argv, flags);
|
||||||
str_modify_keep_cr(str);
|
str_modify_keep_cr(str);
|
||||||
enc = STR_ENC_GET(str);
|
enc = STR_ENC_GET(str);
|
||||||
rb_str_check_dummy_enc(enc);
|
rb_str_check_dummy_enc(enc);
|
||||||
if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
|
if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
|
||||||
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
|
||||||
|
str_shared_replace(str, rb_str_casemap(str, &flags, enc));
|
||||||
c = rb_enc_codepoint_len(s, send, &n, enc);
|
modify = ONIGENC_CASE_MODIFIED & flags;
|
||||||
if (rb_enc_islower(c, enc)) {
|
|
||||||
rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
|
|
||||||
modify = 1;
|
|
||||||
}
|
}
|
||||||
s += n;
|
else {
|
||||||
while (s < send) {
|
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
||||||
c = rb_enc_codepoint_len(s, send, &n, enc);
|
c = rb_enc_codepoint_len(s, send, &n, enc);
|
||||||
if (rb_enc_isupper(c, enc)) {
|
if (rb_enc_islower(c, enc)) {
|
||||||
rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
|
rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
|
||||||
modify = 1;
|
modify = 1;
|
||||||
}
|
}
|
||||||
s += n;
|
s += n;
|
||||||
|
while (s < send) {
|
||||||
|
c = rb_enc_codepoint_len(s, send, &n, enc);
|
||||||
|
if (rb_enc_isupper(c, enc)) {
|
||||||
|
rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
|
||||||
|
modify = 1;
|
||||||
|
}
|
||||||
|
s += n;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (modify) return str;
|
if (modify) return str;
|
||||||
@ -5981,7 +5989,11 @@ rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
|
|||||||
enc = STR_ENC_GET(str);
|
enc = STR_ENC_GET(str);
|
||||||
rb_str_check_dummy_enc(enc);
|
rb_str_check_dummy_enc(enc);
|
||||||
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
||||||
while (s < send) {
|
if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
|
||||||
|
str_shared_replace(str, rb_str_casemap(str, &flags, enc));
|
||||||
|
modify = ONIGENC_CASE_MODIFIED & flags;
|
||||||
|
}
|
||||||
|
else while (s < send) {
|
||||||
unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
|
unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
|
||||||
|
|
||||||
if (rb_enc_isupper(c, enc)) {
|
if (rb_enc_isupper(c, enc)) {
|
||||||
|
@ -5,14 +5,25 @@ require "test/unit"
|
|||||||
# preliminary tests, using :lithuanian as a guard
|
# preliminary tests, using :lithuanian as a guard
|
||||||
# to test new implementation strategy
|
# to test new implementation strategy
|
||||||
class TestCaseMappingPreliminary < Test::Unit::TestCase
|
class TestCaseMappingPreliminary < Test::Unit::TestCase
|
||||||
def test_case_mapping_preliminary
|
def test_ascii
|
||||||
assert_equal 'yukihiro matsumoto (matz)',
|
assert_equal 'yukihiro matsumoto (matz)',
|
||||||
'Yukihiro MATSUMOTO (MATZ)'.downcase(:lithuanian)
|
'Yukihiro MATSUMOTO (MATZ)'.downcase(:lithuanian)
|
||||||
assert_equal 'matsumoto yukTURKISH*ıhTURKISH*ıro (matz)',
|
assert_equal 'YUKIHIRO MATSUMOTO (MATZ)',
|
||||||
'MATSUMOTO YUKIHIRO (MATZ)'.downcase(:turkic, :lithuanian)
|
'yukihiro matsumoto (matz)'.upcase(:lithuanian)
|
||||||
|
assert_equal 'Yukihiro matsumoto (matz)',
|
||||||
|
'yukihiro MATSUMOTO (MATZ)'.capitalize(:lithuanian)
|
||||||
|
assert_equal 'yUKIHIRO matsumoto (MAtz)',
|
||||||
|
'Yukihiro MATSUMOTO (maTZ)'.swapcase(:lithuanian)
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_buffer_allocations
|
def test_turcic
|
||||||
|
assert_equal 'yukihiro matsumoto (matz)',
|
||||||
|
'Yukihiro MATSUMOTO (MATZ)'.downcase(:turkic, :lithuanian)
|
||||||
|
assert_equal 'YUKİHİRO MATSUMOTO (MATZ)',
|
||||||
|
'Yukihiro Matsumoto (matz)'.upcase(:turkic, :lithuanian)
|
||||||
|
end
|
||||||
|
|
||||||
|
def no_longer_a_test_buffer_allocations
|
||||||
assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic, :lithuanian)
|
assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic, :lithuanian)
|
||||||
assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic, :lithuanian)
|
assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic, :lithuanian)
|
||||||
assert_equal 'TURKISH*ı'*1_000, ('I'*1_000).downcase(:turkic, :lithuanian)
|
assert_equal 'TURKISH*ı'*1_000, ('I'*1_000).downcase(:turkic, :lithuanian)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user