Make rb_vm_insns_count a thread local variable

`rb_vm_insns_count` is a global variable used for reporting YJIT
statistics. It is a counter that tallies the number of interpreter
instructions that have been executed, this way we can approximate how
much time we're spending in YJIT compared to the interpreter.

Unfortunately keeping this statistic means that every instruction
executed in the interpreter loop must increment the counter. Normally
this isn't a problem, but in multi-threaded situations (when Ractors are
used), incrementing this counter can become quite costly due to page
caching issues.

Additionally, since there is no locking when incrementing this global,
the count can't really make sense in a multi-threaded environment.

This commit changes `rb_vm_insns_count` to a thread local. That way each
Ractor has it's own copy of the counter and incrementing the counter
becomes quite cheap. Of course this means that in multi-threaded
situations, the value doesn't really make sense (but it didn't make
sense before because of the lack of locking).

The counter is used for YJIT statistics, and since YJIT is basically
disabled when Ractors are in use, I don't think we care about
inaccuracies (for the time being). We can revisit this counter when we
give YJIT multi-threading support, but for the time being this commit
restores multi-threaded performance.

To test this, I used the benchmark in [Bug #20489].

Here is the performance on Ruby 3.2:

```
$ time RUBY_MAX_CPU=12 ./miniruby -v ../test.rb 8 8
ruby 3.2.0 (2022-12-25 revision a528908271) [x86_64-linux]
[0...1, 1...2, 2...3, 3...4, 4...5, 5...6, 6...7, 7...8]
../test.rb:43: warning: Ractor is experimental, and the behavior may change in future versions of Ruby! Also there are many implementation issues.

________________________________________________________
Executed in    2.53 secs    fish           external
   usr time   19.86 secs  370.00 micros   19.86 secs
   sys time    0.02 secs  320.00 micros    0.02 secs
```

We can see the regression in performance on the master branch:

```
$ time RUBY_MAX_CPU=12 ./miniruby -v ../test.rb 8 8
ruby 3.5.0dev (2025-01-10T16:22:26Z master 4a2702dafb) +PRISM [x86_64-linux]
[0...1, 1...2, 2...3, 3...4, 4...5, 5...6, 6...7, 7...8]
../test.rb:43: warning: Ractor is experimental, and the behavior may change in future versions of Ruby! Also there are many implementation issues.

________________________________________________________
Executed in   24.87 secs    fish           external
   usr time  195.55 secs    0.00 micros  195.55 secs
   sys time    0.00 secs  716.00 micros    0.00 secs
```

Here are the stats after this commit:

```
$ time RUBY_MAX_CPU=12 ./miniruby -v ../test.rb 8 8
ruby 3.5.0dev (2025-01-10T20:37:06Z tl 3ef0432779) +PRISM [x86_64-linux]
[0...1, 1...2, 2...3, 3...4, 4...5, 5...6, 6...7, 7...8]
../test.rb:43: warning: Ractor is experimental, and the behavior may change in future versions of Ruby! Also there are many implementation issues.

________________________________________________________
Executed in    2.46 secs    fish           external
   usr time   19.34 secs  381.00 micros   19.34 secs
   sys time    0.01 secs  321.00 micros    0.01 secs
```

[Bug #20489]
This commit is contained in:
Aaron Patterson 2025-01-09 10:21:00 -08:00 committed by Aaron Patterson
parent 039446f601
commit 50c2c4bdde
Notes: git 2025-01-10 21:39:43 +00:00
8 changed files with 14 additions and 10 deletions

View File

@ -81,7 +81,7 @@ struct vm_ifunc *rb_current_ifunc(void);
#if USE_YJIT
/* vm_exec.c */
extern uint64_t rb_vm_insns_count;
extern RB_THREAD_LOCAL_SPECIFIER uint64_t rb_vm_insns_count;
#endif
extern bool rb_free_at_exit;

2
rjit.c
View File

@ -170,7 +170,7 @@ struct rb_rjit_runtime_counters rb_rjit_counters = { 0 };
extern VALUE rb_gc_enable(void);
extern VALUE rb_gc_disable(void);
extern uint64_t rb_vm_insns_count;
extern RB_THREAD_LOCAL_SPECIFIER uint64_t rb_vm_insns_count;
// Disable GC, TracePoint, JIT, stats, and $!
#define WITH_RJIT_ISOLATED_USING_PC(using_pc, stmt) do { \

View File

@ -541,7 +541,7 @@ extern VALUE rb_vm_set_ivar_id(VALUE obj, ID id, VALUE val);
extern VALUE rb_ary_unshift_m(int argc, VALUE *argv, VALUE ary);
extern void* rb_rjit_entry_stub_hit(VALUE branch_stub);
extern void* rb_rjit_branch_stub_hit(VALUE branch_stub, int sp_offset, int target0_p);
extern uint64_t rb_vm_insns_count;
extern RB_THREAD_LOCAL_SPECIFIER uint64_t rb_vm_insns_count;
#include "rjit_c.rbinc"

View File

@ -13,7 +13,7 @@
#if USE_YJIT || USE_RJIT
// The number of instructions executed on vm_exec_core. --yjit-stats uses this.
uint64_t rb_vm_insns_count = 0;
RB_THREAD_LOCAL_SPECIFIER uint64_t rb_vm_insns_count = 0;
#endif
#if VM_COLLECT_USAGE_DETAILS

6
yjit.c
View File

@ -16,6 +16,7 @@
#include "internal/fixnum.h"
#include "internal/numeric.h"
#include "internal/gc.h"
#include "internal/vm.h"
#include "vm_core.h"
#include "vm_callinfo.h"
#include "builtin.h"
@ -96,6 +97,11 @@ rb_yjit_mark_executable(void *mem_block, uint32_t mem_size)
}
}
uint64_t
rb_yjit_vm_insns_count(void) {
return rb_vm_insns_count;
}
// Free the specified memory block.
bool
rb_yjit_mark_unused(void *mem_block, uint32_t mem_size)

View File

@ -317,6 +317,7 @@ fn main() {
.allowlist_function("rb_yjit_get_page_size")
.allowlist_function("rb_yjit_iseq_builtin_attrs")
.allowlist_function("rb_yjit_iseq_inspect")
.allowlist_function("rb_yjit_vm_insns_count")
.allowlist_function("rb_yjit_builtin_function")
.allowlist_function("rb_set_cfp_(pc|sp)")
.allowlist_function("rb_yjit_multi_ractor_p")
@ -380,9 +381,6 @@ fn main() {
.allowlist_function("rb_ivar_get")
.allowlist_function("rb_mod_name")
// From internal/vm.h
.allowlist_var("rb_vm_insns_count")
// From include/ruby/internal/intern/vm.h
.allowlist_function("rb_get_alloc_func")

View File

@ -1061,7 +1061,6 @@ extern "C" {
elts: *const VALUE,
) -> VALUE;
pub fn rb_vm_top_self() -> VALUE;
pub static mut rb_vm_insns_count: u64;
pub fn rb_method_entry_at(obj: VALUE, id: ID) -> *const rb_method_entry_t;
pub fn rb_callable_method_entry(klass: VALUE, id: ID) -> *const rb_callable_method_entry_t;
pub fn rb_callable_method_entry_or_negative(
@ -1141,6 +1140,7 @@ extern "C" {
pub fn rb_jit_cont_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void);
pub fn rb_yjit_mark_writable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool;
pub fn rb_yjit_mark_executable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32);
pub fn rb_yjit_vm_insns_count() -> u64;
pub fn rb_yjit_mark_unused(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool;
pub fn rb_yjit_array_len(a: VALUE) -> ::std::os::raw::c_long;
pub fn rb_yjit_icache_invalidate(

View File

@ -792,7 +792,7 @@ fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE {
set_stat_usize!(hash, "context_cache_bytes", crate::core::CTX_ENCODE_CACHE_BYTES + crate::core::CTX_DECODE_CACHE_BYTES);
// VM instructions count
set_stat_usize!(hash, "vm_insns_count", rb_vm_insns_count as usize);
set_stat_usize!(hash, "vm_insns_count", rb_yjit_vm_insns_count() as usize);
set_stat_usize!(hash, "live_iseq_count", rb_yjit_live_iseq_count as usize);
set_stat_usize!(hash, "iseq_alloc_count", rb_yjit_iseq_alloc_count as usize);
@ -862,7 +862,7 @@ fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE {
set_stat_double!(hash, "avg_len_in_yjit", avg_len_in_yjit);
// Proportion of instructions that retire in YJIT
let total_insns_count = retired_in_yjit + rb_vm_insns_count;
let total_insns_count = retired_in_yjit + rb_yjit_vm_insns_count();
set_stat_usize!(hash, "total_insns_count", total_insns_count as usize);
let ratio_in_yjit: f64 = 100.0 * retired_in_yjit as f64 / total_insns_count as f64;