ruby/internal/string.h
John Hawthorn 57b6a7503f Lock-free hash set for fstrings [Feature #21268]
This implements a hash set which is wait-free for lookup and lock-free
for insert (unless resizing) to use for fstring de-duplication.

As highlighted in https://bugs.ruby-lang.org/issues/19288, heavy use of
fstrings (frozen interned strings) can significantly reduce the
parallelism of Ractors.

I tried a few other approaches first: using an RWLock, striping a series
of RWlocks (partitioning the hash N-ways to reduce lock contention), and
putting a cache in front of it. All of these improved the situation, but
were unsatisfying as all still required locks for writes (and granular
locks are awkward, since we run the risk of needing to reach a vm
barrier) and this table is somewhat write-heavy.

My main reference for this was Cliff Click's talk on a lock free
hash-table for java https://www.youtube.com/watch?v=HJ-719EGIts. It
turns out this lock-free hash set is made easier to implement by a few
properties:

 * We only need a hash set rather than a hash table (we only need keys,
   not values), and so the full entry can be written as a single VALUE
 * As a set we only need lookup/insert/delete, no update
 * Delete is only run inside GC so does not need to be atomic (It could
   be made concurrent)
 * I use rb_vm_barrier for the (rare) table rebuilds (It could be made
   concurrent) We VM lock (but don't require other threads to stop) for
   table rebuilds, as those are rare
 * The conservative garbage collector makes deferred replication easy,
   using a T_DATA object

Another benefits of having a table specific to fstrings is that we
compare by value on lookup/insert, but by identity on delete, as we only
want to remove the exact string which is being freed. This is faster and
provides a second way to avoid the race condition in
https://bugs.ruby-lang.org/issues/21172.

This is a pretty standard open-addressing hash table with quadratic
probing. Similar to our existing st_table or id_table. Deletes (which
happen on GC) replace existing keys with a tombstone, which is the only
type of update which can occur. Tombstones are only cleared out on
resize.

Unlike st_table, the VALUEs are stored in the hash table itself
(st_table's bins) rather than as a compact index. This avoids an extra
pointer dereference and is possible because we don't need to preserve
insertion order. The table targets a load factor of 2 (it is enlarged
once it is half full).
2025-04-18 13:03:54 +09:00

203 lines
6.7 KiB
C

#ifndef INTERNAL_STRING_H /*-*-C-*-vi:se ft=c:*/
#define INTERNAL_STRING_H
/**
* @author Ruby developers <ruby-core@ruby-lang.org>
* @copyright This file is a part of the programming language Ruby.
* Permission is hereby granted, to either redistribute and/or
* modify this file, provided that the conditions mentioned in the
* file COPYING are met. Consult the file for details.
* @brief Internal header for String.
*/
#include "ruby/internal/config.h"
#include <stddef.h> /* for size_t */
#include "internal/compilers.h" /* for __has_builtin */
#include "ruby/internal/stdbool.h" /* for bool */
#include "ruby/encoding.h" /* for rb_encoding */
#include "ruby/ruby.h" /* for VALUE */
#define STR_SHARED FL_USER0 /* = ELTS_SHARED */
#define STR_NOEMBED FL_USER1
#define STR_CHILLED (FL_USER2 | FL_USER3)
#define STR_CHILLED_LITERAL FL_USER2
#define STR_CHILLED_SYMBOL_TO_S FL_USER3
enum ruby_rstring_private_flags {
RSTRING_CHILLED = STR_CHILLED,
};
#ifdef rb_fstring_cstr
# undef rb_fstring_cstr
#endif
/* string.c */
VALUE rb_fstring(VALUE);
VALUE rb_fstring_cstr(const char *str);
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc);
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p);
int rb_str_symname_p(VALUE);
VALUE rb_str_quote_unprintable(VALUE);
char *rb_str_fill_terminator(VALUE str, const int termlen);
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen);
VALUE rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg);
VALUE rb_str_chomp_string(VALUE str, VALUE chomp);
VALUE rb_external_str_with_enc(VALUE str, rb_encoding *eenc);
VALUE rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
rb_encoding *from, int ecflags, VALUE ecopts);
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl);
VALUE rb_str_escape(VALUE str);
size_t rb_str_memsize(VALUE);
char *rb_str_to_cstr(VALUE str);
const char *ruby_escaped_char(int c);
void rb_str_make_independent(VALUE str);
int rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc);
int rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code);
VALUE rb_str_include(VALUE str, VALUE arg);
VALUE rb_str_byte_substr(VALUE str, VALUE beg, VALUE len);
VALUE rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty);
VALUE rb_str_tmp_frozen_no_embed_acquire(VALUE str);
void rb_str_make_embedded(VALUE);
VALUE rb_str_upto_each(VALUE, VALUE, int, int (*each)(VALUE, VALUE), VALUE);
size_t rb_str_size_as_embedded(VALUE);
bool rb_str_reembeddable_p(VALUE);
VALUE rb_str_upto_endless_each(VALUE, int (*each)(VALUE, VALUE), VALUE);
VALUE rb_str_with_debug_created_info(VALUE, VALUE, int);
VALUE rb_str_frozen_bare_string(VALUE);
/* error.c */
void rb_warn_unchilled_literal(VALUE str);
void rb_warn_unchilled_symbol_to_s(VALUE str);
static inline bool STR_EMBED_P(VALUE str);
static inline bool STR_SHARED_P(VALUE str);
static inline VALUE QUOTE(VALUE v);
static inline VALUE QUOTE_ID(ID v);
static inline bool is_ascii_string(VALUE str);
static inline bool is_broken_string(VALUE str);
static inline VALUE rb_str_eql_internal(const VALUE str1, const VALUE str2);
RUBY_SYMBOL_EXPORT_BEGIN
/* string.c (export) */
VALUE rb_str_tmp_frozen_acquire(VALUE str);
void rb_str_tmp_frozen_release(VALUE str, VALUE tmp);
VALUE rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc);
RUBY_SYMBOL_EXPORT_END
VALUE rb_fstring_new(const char *ptr, long len);
void rb_gc_free_fstring(VALUE obj);
bool rb_obj_is_fstring_table(VALUE obj);
void Init_fstring_table();
VALUE rb_obj_as_string_result(VALUE str, VALUE obj);
VALUE rb_str_opt_plus(VALUE x, VALUE y);
VALUE rb_str_concat_literals(size_t num, const VALUE *strary);
VALUE rb_str_eql(VALUE str1, VALUE str2);
VALUE rb_id_quote_unprintable(ID);
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc);
VALUE rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc);
struct rb_execution_context_struct;
VALUE rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled);
#define rb_fstring_lit(str) rb_fstring_new((str), rb_strlen_lit(str))
#define rb_fstring_literal(str) rb_fstring_lit(str)
#define rb_fstring_enc_lit(str, enc) rb_fstring_enc_new((str), rb_strlen_lit(str), (enc))
#define rb_fstring_enc_literal(str, enc) rb_fstring_enc_lit(str, enc)
static inline VALUE
QUOTE(VALUE v)
{
return rb_str_quote_unprintable(v);
}
static inline VALUE
QUOTE_ID(ID i)
{
return rb_id_quote_unprintable(i);
}
static inline bool
STR_EMBED_P(VALUE str)
{
return ! FL_TEST_RAW(str, STR_NOEMBED);
}
static inline bool
STR_SHARED_P(VALUE str)
{
return FL_ALL_RAW(str, STR_NOEMBED | STR_SHARED);
}
static inline bool
CHILLED_STRING_P(VALUE obj)
{
return RB_TYPE_P(obj, T_STRING) && FL_TEST_RAW(obj, STR_CHILLED);
}
static inline void
CHILLED_STRING_MUTATED(VALUE str)
{
VALUE chilled_reason = RB_FL_TEST_RAW(str, STR_CHILLED);
FL_UNSET_RAW(str, STR_CHILLED);
switch (chilled_reason) {
case STR_CHILLED_SYMBOL_TO_S:
rb_warn_unchilled_symbol_to_s(str);
break;
case STR_CHILLED_LITERAL:
rb_warn_unchilled_literal(str);
break;
default:
rb_bug("RString was chilled for multiple reasons");
}
}
static inline bool
is_ascii_string(VALUE str)
{
return rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT;
}
static inline bool
is_broken_string(VALUE str)
{
return rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN;
}
static inline bool
at_char_boundary(const char *s, const char *p, const char *e, rb_encoding *enc)
{
return rb_enc_left_char_head(s, p, e, enc) == p;
}
static inline bool
at_char_right_boundary(const char *s, const char *p, const char *e, rb_encoding *enc)
{
RUBY_ASSERT(s <= p);
RUBY_ASSERT(p <= e);
return rb_enc_right_char_head(s, p, e, enc) == p;
}
/* expect tail call optimization */
// YJIT needs this function to never allocate and never raise
static inline VALUE
rb_str_eql_internal(const VALUE str1, const VALUE str2)
{
const long len = RSTRING_LEN(str1);
const char *ptr1, *ptr2;
if (len != RSTRING_LEN(str2)) return Qfalse;
if (!rb_str_comparable(str1, str2)) return Qfalse;
if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
return Qtrue;
if (memcmp(ptr1, ptr2, len) == 0)
return Qtrue;
return Qfalse;
}
#if __has_builtin(__builtin_constant_p)
# define rb_fstring_cstr(str) \
(__builtin_constant_p(str) ? \
rb_fstring_new((str), (long)strlen(str)) : \
(rb_fstring_cstr)(str))
#endif
#endif /* INTERNAL_STRING_H */