String#force_encoding: don't clear coderange if encoding is unchanged

Some code out there blindly calls `force_encoding` without checking
what the original encoding was, which clears the coderange needlessly.

If the String is big, it can be a rather costly mistake.

For instance, the `rack-utf8_sanitizer` gem does this on request
bodies.
This commit is contained in:
Jean Boussier 2023-11-08 14:05:52 +01:00 committed by Jean Boussier
parent 0a7e620a36
commit ea1b1ea1aa
3 changed files with 19 additions and 2 deletions

View File

@ -10843,7 +10843,23 @@ static VALUE
rb_str_force_encoding(VALUE str, VALUE enc)
{
// Re-tags `str` with the encoding designated by `enc` and returns `str`.
// The cached coderange is cleared only when the encoding index actually
// changes and the string cannot keep a 7bit coderange under the new encoding.
//
// NOTE(review): this listing looks like a diff rendered without +/- markers.
// The rb_enc_associate() call below appears to be the line the change removes,
// and the statements after it appear to be its replacement -- confirm against
// the real file before relying on the statement order shown here.
//
// Presumably rejects strings that must not be mutated -- verify against
// str_modifiable's definition.
str_modifiable(str);
rb_enc_associate(str, rb_to_encoding(enc));
// Resolve `enc` once; both the encoding pointer and its index are used below.
rb_encoding *encoding = rb_to_encoding(enc);
int idx = rb_enc_to_index(encoding);
// If the encoding is unchanged, we do nothing.
// Returning here skips the ENC_CODERANGE_CLEAR at the end of the function.
if (ENCODING_GET(str) == idx) {
return str;
}
// The encoding differs: associate the string with the new encoding index.
rb_enc_associate_index(str, idx);
// If the coderange was 7bit and the new encoding is ASCII-compatible
// we can keep the coderange.
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
return str;
}
// Every remaining case discards the cached coderange.
ENC_CODERANGE_CLEAR(str);
return str;
}

View File

@ -585,7 +585,7 @@ class TestObjSpace < Test::Unit::TestCase
def test_dump_string_coderange
assert_includes ObjectSpace.dump("TEST STRING"), '"coderange":"7bit"'
unknown = "TEST STRING".dup.force_encoding(Encoding::BINARY)
unknown = "TEST STRING".dup.force_encoding(Encoding::UTF_16BE)
2.times do # ensure that dumping the string doesn't mutate it
assert_includes ObjectSpace.dump(unknown), '"coderange":"unknown"'
end

View File

@ -2747,6 +2747,7 @@ EOS
require 'objspace'
begin;
obj = "a" * 12
obj.force_encoding(Encoding::UTF_16LE)
obj.force_encoding(Encoding::BINARY)
assert_include(ObjectSpace.dump(obj), '"coderange":"unknown"')
Process.warmup