ruby/vm_exec.c
Aaron Patterson 50c2c4bdde Make rb_vm_insns_count a thread local variable
`rb_vm_insns_count` is a global variable used for reporting YJIT
statistics. It is a counter that tallies the number of interpreter
instructions that have been executed, this way we can approximate how
much time we're spending in YJIT compared to the interpreter.

Unfortunately keeping this statistic means that every instruction
executed in the interpreter loop must increment the counter. Normally
this isn't a problem, but in multi-threaded situations (when Ractors are
used), incrementing this counter can become quite costly due to page
caching issues.

Additionally, since there is no locking when incrementing this global,
the count can't really make sense in a multi-threaded environment.

This commit changes `rb_vm_insns_count` to a thread local. That way each
Ractor has it's own copy of the counter and incrementing the counter
becomes quite cheap. Of course this means that in multi-threaded
situations, the value doesn't really make sense (but it didn't make
sense before because of the lack of locking).

The counter is used for YJIT statistics, and since YJIT is basically
disabled when Ractors are in use, I don't think we care about
inaccuracies (for the time being). We can revisit this counter when we
give YJIT multi-threading support, but for the time being this commit
restores multi-threaded performance.

To test this, I used the benchmark in [Bug #20489].

Here is the performance on Ruby 3.2:

```
$ time RUBY_MAX_CPU=12 ./miniruby -v ../test.rb 8 8
ruby 3.2.0 (2022-12-25 revision a528908271) [x86_64-linux]
[0...1, 1...2, 2...3, 3...4, 4...5, 5...6, 6...7, 7...8]
../test.rb:43: warning: Ractor is experimental, and the behavior may change in future versions of Ruby! Also there are many implementation issues.

________________________________________________________
Executed in    2.53 secs    fish           external
   usr time   19.86 secs  370.00 micros   19.86 secs
   sys time    0.02 secs  320.00 micros    0.02 secs
```

We can see the regression in performance on the master branch:

```
$ time RUBY_MAX_CPU=12 ./miniruby -v ../test.rb 8 8
ruby 3.5.0dev (2025-01-10T16:22:26Z master 4a2702dafb) +PRISM [x86_64-linux]
[0...1, 1...2, 2...3, 3...4, 4...5, 5...6, 6...7, 7...8]
../test.rb:43: warning: Ractor is experimental, and the behavior may change in future versions of Ruby! Also there are many implementation issues.

________________________________________________________
Executed in   24.87 secs    fish           external
   usr time  195.55 secs    0.00 micros  195.55 secs
   sys time    0.00 secs  716.00 micros    0.00 secs
```

Here are the stats after this commit:

```
$ time RUBY_MAX_CPU=12 ./miniruby -v ../test.rb 8 8
ruby 3.5.0dev (2025-01-10T20:37:06Z tl 3ef0432779) +PRISM [x86_64-linux]
[0...1, 1...2, 2...3, 3...4, 4...5, 5...6, 6...7, 7...8]
../test.rb:43: warning: Ractor is experimental, and the behavior may change in future versions of Ruby! Also there are many implementation issues.

________________________________________________________
Executed in    2.46 secs    fish           external
   usr time   19.34 secs  381.00 micros   19.34 secs
   sys time    0.01 secs  321.00 micros    0.01 secs
```

[Bug #20489]
2025-01-10 13:39:21 -08:00

155 lines
3.5 KiB
C

/* -*-c-*- */
/**********************************************************************
vm_exec.c -
$Author$
Copyright (C) 2004-2007 Koichi Sasada
**********************************************************************/
#include <math.h>
#if USE_YJIT || USE_RJIT
// The number of instructions executed on vm_exec_core. --yjit-stats uses this.
RB_THREAD_LOCAL_SPECIFIER uint64_t rb_vm_insns_count = 0;
#endif
#if VM_COLLECT_USAGE_DETAILS
static void vm_analysis_insn(int insn);
#endif
#if VMDEBUG > 0
#define DECL_SC_REG(type, r, reg) register type reg_##r
#elif defined(__GNUC__) && defined(__x86_64__)
#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg)
#elif defined(__GNUC__) && defined(__i386__)
#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("e" reg)
#elif defined(__GNUC__) && (defined(__powerpc64__) || defined(__POWERPC__))
#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg)
#elif defined(__GNUC__) && defined(__aarch64__)
#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("x" reg)
#else
#define DECL_SC_REG(type, r, reg) register type reg_##r
#endif
/* #define DECL_SC_REG(r, reg) VALUE reg_##r */
#if !OPT_CALL_THREADED_CODE
static VALUE
vm_exec_core(rb_execution_context_t *ec)
{
#if defined(__GNUC__) && defined(__i386__)
DECL_SC_REG(const VALUE *, pc, "di");
DECL_SC_REG(rb_control_frame_t *, cfp, "si");
#define USE_MACHINE_REGS 1
#elif defined(__GNUC__) && defined(__x86_64__)
DECL_SC_REG(const VALUE *, pc, "14");
DECL_SC_REG(rb_control_frame_t *, cfp, "15");
#define USE_MACHINE_REGS 1
#elif defined(__GNUC__) && (defined(__powerpc64__) || defined(__POWERPC__))
DECL_SC_REG(const VALUE *, pc, "14");
DECL_SC_REG(rb_control_frame_t *, cfp, "15");
#define USE_MACHINE_REGS 1
#elif defined(__GNUC__) && defined(__aarch64__)
DECL_SC_REG(const VALUE *, pc, "19");
DECL_SC_REG(rb_control_frame_t *, cfp, "20");
#define USE_MACHINE_REGS 1
#else
register rb_control_frame_t *reg_cfp;
const VALUE *reg_pc;
#define USE_MACHINE_REGS 0
#endif
#if USE_MACHINE_REGS
#undef RESTORE_REGS
#define RESTORE_REGS() \
{ \
VM_REG_CFP = ec->cfp; \
reg_pc = reg_cfp->pc; \
}
#undef VM_REG_PC
#define VM_REG_PC reg_pc
#undef GET_PC
#define GET_PC() (reg_pc)
#undef SET_PC
#define SET_PC(x) (reg_cfp->pc = VM_REG_PC = (x))
#endif
#if OPT_TOKEN_THREADED_CODE || OPT_DIRECT_THREADED_CODE
#include "vmtc.inc"
if (UNLIKELY(ec == 0)) {
return (VALUE)insns_address_table;
}
#endif
reg_cfp = ec->cfp;
reg_pc = reg_cfp->pc;
first:
INSN_DISPATCH();
/*****************/
#include "vm.inc"
/*****************/
END_INSNS_DISPATCH();
/* unreachable */
rb_bug("vm_eval: unreachable");
goto first;
}
const void **
rb_vm_get_insns_address_table(void)
{
return (const void **)vm_exec_core(0);
}
#else /* OPT_CALL_THREADED_CODE */
#include "vm.inc"
#include "vmtc.inc"
const void **
rb_vm_get_insns_address_table(void)
{
return (const void **)insns_address_table;
}
static VALUE
vm_exec_core(rb_execution_context_t *ec)
{
register rb_control_frame_t *reg_cfp = ec->cfp;
rb_thread_t *th;
while (1) {
reg_cfp = ((rb_insn_func_t) (*GET_PC()))(ec, reg_cfp);
if (UNLIKELY(reg_cfp == 0)) {
break;
}
}
if (!UNDEF_P((th = rb_ec_thread_ptr(ec))->retval)) {
VALUE ret = th->retval;
th->retval = Qundef;
return ret;
}
else {
VALUE err = ec->errinfo;
ec->errinfo = Qnil;
return err;
}
}
#endif