From 5b129c899a5cf3bf8253eaaf7fbc8331b1e55f75 Mon Sep 17 00:00:00 2001 From: Takashi Kokubun Date: Tue, 27 Aug 2024 17:04:43 -0700 Subject: [PATCH] YJIT: Pass method arguments using registers (#11280) * YJIT: Pass method arguments using registers * s/at_current_insn/at_compile_target/ * Implement register shuffle --- bootstraptest/test_yjit.rb | 28 +++++++ yjit/src/backend/ir.rs | 166 +++++++++++++++++++++++-------------- yjit/src/codegen.rs | 154 ++++++++++++++++++++++++---------- yjit/src/core.rs | 106 +++++++++++++++++++---- 4 files changed, 330 insertions(+), 124 deletions(-) diff --git a/bootstraptest/test_yjit.rb b/bootstraptest/test_yjit.rb index eccaa542c8..df1eba1ce1 100644 --- a/bootstraptest/test_yjit.rb +++ b/bootstraptest/test_yjit.rb @@ -2256,6 +2256,34 @@ assert_equal '7', %q{ foo(5,2) } +# regression test for argument registers with invalidation +assert_equal '[0, 1, 2]', %q{ + def test(n) + ret = n + binding + ret + end + + [0, 1, 2].map do |n| + test(n) + end +} + +# regression test for argument registers +assert_equal 'true', %q{ + class Foo + def ==(other) + other == nil + end + end + + def test + [Foo.new].include?(Foo.new) + end + + test +} + # test pattern matching assert_equal '[:ok, :ok]', %q{ class C diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs index 3355dc5da3..eb32dac987 100644 --- a/yjit/src/backend/ir.rs +++ b/yjit/src/backend/ir.rs @@ -1032,7 +1032,7 @@ pub struct Assembler { pub ctx: Context, /// The current ISEQ's local table size. asm.local_opnd() uses this, and it's - /// sometimes hard to pass this value, e.g. asm.spill_temps() in asm.ccall(). + /// sometimes hard to pass this value, e.g. asm.spill_regs() in asm.ccall(). /// /// `None` means we're not assembling for an ISEQ, or that the local size is /// not relevant. @@ -1241,8 +1241,36 @@ impl Assembler self.ctx.clear_local_types(); } + /// Repurpose stack temp registers to the corresponding locals for arguments + pub fn map_temp_regs_to_args(&mut self, callee_ctx: &mut Context, argc: i32) -> Vec { + let mut callee_reg_mapping = callee_ctx.get_reg_mapping(); + let mut mapped_temps = vec![]; + + for arg_idx in 0..argc { + let stack_idx: u8 = (self.ctx.get_stack_size() as i32 - argc + arg_idx).try_into().unwrap(); + let temp_opnd = RegOpnd::Stack(stack_idx); + + // For each argument, if the stack temp for it has a register, + // let the callee use the register for the local variable. 
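+ // For example, if the temp holding the first argument lives in register index 1,
+ // the callee's Local(0) is mapped to register index 1 as well, so the value is
+ // already in place when the callee starts running.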
+ if let Some(reg_idx) = self.ctx.get_reg_mapping().get_reg(temp_opnd) { + let local_opnd = RegOpnd::Local(arg_idx.try_into().unwrap()); + callee_reg_mapping.set_reg(local_opnd, reg_idx); + mapped_temps.push(temp_opnd); + } + } + + asm_comment!(self, "local maps: {:?}", callee_reg_mapping); + callee_ctx.set_reg_mapping(callee_reg_mapping); + mapped_temps + } + /// Spill all live registers to the stack pub fn spill_regs(&mut self) { + self.spill_regs_except(&vec![]); + } + + /// Spill all live registers except `ignored_temps` to the stack + pub fn spill_regs_except(&mut self, ignored_temps: &Vec) { // Forget registers above the stack top let mut reg_mapping = self.ctx.get_reg_mapping(); for stack_idx in self.ctx.get_stack_size()..MAX_CTX_TEMPS as u8 { @@ -1250,36 +1278,46 @@ impl Assembler } self.set_reg_mapping(reg_mapping); - // Spill live stack temps - if self.ctx.get_reg_mapping() != RegMapping::default() { - asm_comment!(self, "spill_temps: {:?} -> {:?}", self.ctx.get_reg_mapping(), RegMapping::default()); - - // Spill stack temps - for stack_idx in 0..u8::min(MAX_CTX_TEMPS as u8, self.ctx.get_stack_size()) { - if reg_mapping.dealloc_reg(RegOpnd::Stack(stack_idx)) { - let idx = self.ctx.get_stack_size() - 1 - stack_idx; - self.spill_temp(self.stack_opnd(idx.into())); - } - } - - // Spill locals - for local_idx in 0..MAX_CTX_TEMPS as u8 { - if reg_mapping.dealloc_reg(RegOpnd::Local(local_idx)) { - let first_local_ep_offset = self.num_locals.unwrap() + VM_ENV_DATA_SIZE - 1; - let ep_offset = first_local_ep_offset - local_idx as u32; - self.spill_temp(self.local_opnd(ep_offset)); - } - } - - self.ctx.set_reg_mapping(reg_mapping); + // If no registers are in use, skip all checks + if self.ctx.get_reg_mapping() == RegMapping::default() { + return; } - // Every stack temp should have been spilled - assert_eq!(self.ctx.get_reg_mapping(), RegMapping::default()); + // Collect stack temps to be spilled + let mut spilled_opnds = vec![]; + for stack_idx in 0..u8::min(MAX_CTX_TEMPS as u8, self.ctx.get_stack_size()) { + let reg_opnd = RegOpnd::Stack(stack_idx); + if !ignored_temps.contains(®_opnd) && reg_mapping.dealloc_reg(reg_opnd) { + let idx = self.ctx.get_stack_size() - 1 - stack_idx; + let spilled_opnd = self.stack_opnd(idx.into()); + spilled_opnds.push(spilled_opnd); + reg_mapping.dealloc_reg(spilled_opnd.reg_opnd()); + } + } + + // Collect locals to be spilled + for local_idx in 0..MAX_CTX_TEMPS as u8 { + if reg_mapping.dealloc_reg(RegOpnd::Local(local_idx)) { + let first_local_ep_offset = self.num_locals.unwrap() + VM_ENV_DATA_SIZE - 1; + let ep_offset = first_local_ep_offset - local_idx as u32; + let spilled_opnd = self.local_opnd(ep_offset); + spilled_opnds.push(spilled_opnd); + reg_mapping.dealloc_reg(spilled_opnd.reg_opnd()); + } + } + + // Spill stack temps and locals + if !spilled_opnds.is_empty() { + asm_comment!(self, "spill_regs: {:?} -> {:?}", self.ctx.get_reg_mapping(), reg_mapping); + for &spilled_opnd in spilled_opnds.iter() { + self.spill_reg(spilled_opnd); + } + self.ctx.set_reg_mapping(reg_mapping); + } } /// Spill a stack temp from a register to the stack - fn spill_temp(&mut self, opnd: Opnd) { + fn spill_reg(&mut self, opnd: Opnd) { assert_ne!(self.ctx.get_reg_mapping().get_reg(opnd.reg_opnd()), None); // Use different RegMappings for dest and src operands @@ -1308,6 +1346,42 @@ impl Assembler } } + // Shuffle register moves, sometimes adding extra moves using SCRATCH_REG, + // so that they will not rewrite each other before they are used. 
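+ // For example, the cyclic moves [r1 <- r2, r2 <- r1] have no safe order,
+ // so they are emitted as [SCRATCH <- r2, r2 <- r1, r1 <- SCRATCH].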
+ pub fn reorder_reg_moves(old_moves: &Vec<(Reg, Opnd)>) -> Vec<(Reg, Opnd)> {
+ // Return the index of a move whose destination is not used as a source if any.
+ fn find_safe_move(moves: &Vec<(Reg, Opnd)>) -> Option<usize> {
+ moves.iter().enumerate().find(|(_, &(dest_reg, _))| {
+ moves.iter().all(|&(_, src_opnd)| src_opnd != Opnd::Reg(dest_reg))
+ }).map(|(index, _)| index)
+ }
+
+ // Remove moves whose source and destination are the same
+ let mut old_moves: Vec<(Reg, Opnd)> = old_moves.clone().into_iter()
+ .filter(|&(reg, opnd)| Opnd::Reg(reg) != opnd).collect();
+
+ let mut new_moves = vec![];
+ while old_moves.len() > 0 {
+ // Keep taking safe moves
+ while let Some(index) = find_safe_move(&old_moves) {
+ new_moves.push(old_moves.remove(index));
+ }
+
+ // No safe move. Load the source of one move into SCRATCH_REG, and
+ // then load SCRATCH_REG into the destination when it's safe.
+ if old_moves.len() > 0 {
+ // Make sure it's safe to use SCRATCH_REG
+ assert!(old_moves.iter().all(|&(_, opnd)| opnd != Opnd::Reg(Assembler::SCRATCH_REG)));
+
+ // Move SCRATCH <- opnd, and delay reg <- SCRATCH
+ let (reg, opnd) = old_moves.remove(0);
+ new_moves.push((Assembler::SCRATCH_REG, opnd));
+ old_moves.push((reg, Opnd::Reg(Assembler::SCRATCH_REG)));
+ }
+ }
+ new_moves
+ }
+
 /// Sets the out field on the various instructions that require allocated
 /// registers because their output is used as the operand on a subsequent
 /// instruction. This is our implementation of the linear scan algorithm.
@@ -1353,42 +1427,6 @@ impl Assembler
 }
 }

- // Reorder C argument moves, sometimes adding extra moves using SCRATCH_REG,
- // so that they will not rewrite each other before they are used.
- fn reorder_c_args(c_args: &Vec<(Reg, Opnd)>) -> Vec<(Reg, Opnd)> {
- // Return the index of a move whose destination is not used as a source if any.
- fn find_safe_arg(c_args: &Vec<(Reg, Opnd)>) -> Option<usize> {
- c_args.iter().enumerate().find(|(_, &(dest_reg, _))| {
- c_args.iter().all(|&(_, src_opnd)| src_opnd != Opnd::Reg(dest_reg))
- }).map(|(index, _)| index)
- }
-
- // Remove moves whose source and destination are the same
- let mut c_args: Vec<(Reg, Opnd)> = c_args.clone().into_iter()
- .filter(|&(reg, opnd)| Opnd::Reg(reg) != opnd).collect();
-
- let mut moves = vec![];
- while c_args.len() > 0 {
- // Keep taking safe moves
- while let Some(index) = find_safe_arg(&c_args) {
- moves.push(c_args.remove(index));
- }
-
- // No safe move. Load the source of one move into SCRATCH_REG, and
- // then load SCRATCH_REG into the destination when it's safe.
- if c_args.len() > 0 {
- // Make sure it's safe to use SCRATCH_REG
- assert!(c_args.iter().all(|&(_, opnd)| opnd != Opnd::Reg(Assembler::SCRATCH_REG)));
-
- // Move SCRATCH <- opnd, and delay reg <- SCRATCH
- let (reg, opnd) = c_args.remove(0);
- moves.push((Assembler::SCRATCH_REG, opnd));
- c_args.push((reg, Opnd::Reg(Assembler::SCRATCH_REG)));
- }
- }
- moves
- }
-
 // Adjust the number of entries in live_ranges so that it can be indexed by mapped indexes.
fn shift_live_ranges(live_ranges: &mut Vec, start_index: usize, shift_offset: isize) { if shift_offset >= 0 { @@ -1564,7 +1602,7 @@ impl Assembler if c_args.len() > 0 { // Resolve C argument dependencies let c_args_len = c_args.len() as isize; - let moves = reorder_c_args(&c_args.drain(..).into_iter().collect()); + let moves = Self::reorder_reg_moves(&c_args.drain(..).into_iter().collect()); shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len); // Push batched C arguments @@ -1808,7 +1846,7 @@ impl Assembler { // Mark all temps as not being in registers. // Temps will be marked back as being in registers by cpop_all. // We assume that cpush_all + cpop_all are used for C functions in utils.rs - // that don't require spill_temps for GC. + // that don't require spill_regs for GC. self.set_reg_mapping(RegMapping::default()); } diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index 458777b5e3..493e61c28b 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -3,6 +3,7 @@ use crate::asm::*; use crate::backend::ir::*; +use crate::backend::current::TEMP_REGS; use crate::core::*; use crate::cruby::*; use crate::invariants::*; @@ -114,10 +115,13 @@ pub struct JITState<'a> { /// Stack of symbol names for --yjit-perf perf_stack: Vec, + + /// When true, this block is the first block compiled by gen_block_series(). + first_block: bool, } impl<'a> JITState<'a> { - pub fn new(blockid: BlockId, starting_ctx: Context, output_ptr: CodePtr, ec: EcPtr, ocb: &'a mut OutlinedCb) -> Self { + pub fn new(blockid: BlockId, starting_ctx: Context, output_ptr: CodePtr, ec: EcPtr, ocb: &'a mut OutlinedCb, first_block: bool) -> Self { JITState { iseq: blockid.iseq, starting_insn_idx: blockid.idx, @@ -140,6 +144,7 @@ impl<'a> JITState<'a> { block_assumes_single_ractor: false, perf_map: Rc::default(), perf_stack: vec![], + first_block, } } @@ -212,9 +217,16 @@ impl<'a> JITState<'a> { self.next_insn_idx() + insn_len(next_opcode) as u16 } - // Check if we are compiling the instruction at the stub PC + // Check if we are compiling the instruction at the stub PC with the target Context // Meaning we are compiling the instruction that is next to execute - pub fn at_current_insn(&self) -> bool { + pub fn at_compile_target(&self) -> bool { + // If this is not the first block compiled by gen_block_series(), + // it might be compiling the same block again with a different Context. + // In that case, it should defer_compilation() and inspect the stack there. + if !self.first_block { + return false; + } + let ec_pc: *mut VALUE = unsafe { get_cfp_pc(self.get_cfp()) }; ec_pc == self.pc } @@ -222,7 +234,7 @@ impl<'a> JITState<'a> { // Peek at the nth topmost value on the Ruby stack. // Returns the topmost value when n == 0. 
pub fn peek_at_stack(&self, ctx: &Context, n: isize) -> VALUE { - assert!(self.at_current_insn()); + assert!(self.at_compile_target()); assert!(n < ctx.get_stack_size() as isize); // Note: this does not account for ctx->sp_offset because @@ -241,7 +253,7 @@ impl<'a> JITState<'a> { } fn peek_at_local(&self, n: i32) -> VALUE { - assert!(self.at_current_insn()); + assert!(self.at_compile_target()); let local_table_size: isize = unsafe { get_iseq_body_local_table_size(self.iseq) } .try_into() @@ -257,7 +269,7 @@ impl<'a> JITState<'a> { } fn peek_at_block_handler(&self, level: u32) -> VALUE { - assert!(self.at_current_insn()); + assert!(self.at_compile_target()); unsafe { let ep = get_cfp_ep_level(self.get_cfp(), level); @@ -656,7 +668,7 @@ fn verify_ctx(jit: &JITState, ctx: &Context) { } // Only able to check types when at current insn - assert!(jit.at_current_insn()); + assert!(jit.at_compile_target()); let self_val = jit.peek_at_self(); let self_val_type = Type::from(self_val); @@ -1172,6 +1184,7 @@ pub fn gen_single_block( ec: EcPtr, cb: &mut CodeBlock, ocb: &mut OutlinedCb, + first_block: bool, ) -> Result { // Limit the number of specialized versions for this block let ctx = limit_block_versions(blockid, start_ctx); @@ -1195,7 +1208,7 @@ pub fn gen_single_block( let mut insn_idx: IseqIdx = blockid.idx; // Initialize a JIT state object - let mut jit = JITState::new(blockid, ctx, cb.get_write_ptr(), ec, ocb); + let mut jit = JITState::new(blockid, ctx, cb.get_write_ptr(), ec, ocb, first_block); jit.iseq = blockid.iseq; // Create a backend assembler instance @@ -1265,7 +1278,7 @@ pub fn gen_single_block( } // In debug mode, verify our existing assumption - if cfg!(debug_assertions) && get_option!(verify_ctx) && jit.at_current_insn() { + if cfg!(debug_assertions) && get_option!(verify_ctx) && jit.at_compile_target() { verify_ctx(&jit, &asm.ctx); } @@ -1508,7 +1521,7 @@ fn fuse_putobject_opt_ltlt( if shift_amt > 63 || shift_amt < 0 { return None; } - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -1772,7 +1785,7 @@ fn gen_splatkw( asm: &mut Assembler, ) -> Option { // Defer compilation so we can specialize on a runtime hash operand - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -2146,7 +2159,7 @@ fn gen_expandarray( let array_opnd = asm.stack_opnd(0); // Defer compilation so we can specialize on a runtime `self` - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -2345,7 +2358,16 @@ fn gen_getlocal_generic( // Load the local from the block // val = *(vm_get_ep(GET_EP(), level) - idx); let offs = -(SIZEOF_VALUE_I32 * ep_offset as i32); - Opnd::mem(64, ep_opnd, offs) + let local_opnd = Opnd::mem(64, ep_opnd, offs); + + // Write back an argument register to the stack. If the local variable + // is an argument, it might have an allocated register, but if this ISEQ + // is known to escape EP, the register shouldn't be used after this getlocal. + if level == 0 && asm.ctx.get_reg_mapping().get_reg(asm.local_opnd(ep_offset).reg_opnd()).is_some() { + asm.mov(local_opnd, asm.local_opnd(ep_offset)); + } + + local_opnd }; // Write the local at SP @@ -2425,6 +2447,13 @@ fn gen_setlocal_generic( asm.alloc_reg(local_opnd.reg_opnd()); (flags_opnd, local_opnd) } else { + // Make sure getlocal doesn't read a stale register. 
If the local variable + // is an argument, it might have an allocated register, but if this ISEQ + // is known to escape EP, the register shouldn't be used after this setlocal. + if level == 0 { + asm.ctx.dealloc_reg(asm.local_opnd(ep_offset).reg_opnd()); + } + // Load flags and the local for the level let ep_opnd = gen_get_ep(asm, level); let flags_opnd = Opnd::mem( @@ -2627,11 +2656,11 @@ fn gen_checkkeyword( // The index of the keyword we want to check let index: i64 = jit.get_arg(1).as_i64(); - // Load environment pointer EP - let ep_opnd = gen_get_ep(asm, 0); - - // VALUE kw_bits = *(ep - bits); - let bits_opnd = Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * -bits_offset); + // `unspecified_bits` is a part of the local table. Therefore, we may allocate a register for + // that "local" when passing it as an argument. We must use such a register to avoid loading + // random bits from the stack if any. We assume that EP is not escaped as of entering a method + // with keyword arguments. + let bits_opnd = asm.local_opnd(bits_offset as u32); // unsigned int b = (unsigned int)FIX2ULONG(kw_bits); // if ((b & (0x01 << idx))) { @@ -2846,7 +2875,7 @@ fn gen_getinstancevariable( asm: &mut Assembler, ) -> Option { // Defer compilation so we can specialize on a runtime `self` - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -2910,7 +2939,7 @@ fn gen_setinstancevariable( asm: &mut Assembler, ) -> Option { // Defer compilation so we can specialize on a runtime `self` - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -3221,7 +3250,7 @@ fn gen_definedivar( asm: &mut Assembler, ) -> Option { // Defer compilation so we can specialize base on a runtime receiver - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -3550,7 +3579,7 @@ fn gen_equality_specialized( return Some(true); } - if !jit.at_current_insn() { + if !jit.at_compile_target() { return None; } let comptime_a = jit.peek_at_stack(&asm.ctx, 1); @@ -3669,7 +3698,7 @@ fn gen_opt_aref( } // Defer compilation so we can specialize base on a runtime receiver - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -3770,7 +3799,7 @@ fn gen_opt_aset( asm: &mut Assembler, ) -> Option { // Defer compilation so we can specialize on a runtime `self` - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -4376,7 +4405,7 @@ fn gen_opt_case_dispatch( // We'd hope that our jitted code will be sufficiently fast without the // hash lookup, at least for small hashes, but it's worth revisiting this // assumption in the future. 
- if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -6433,14 +6462,6 @@ fn gen_push_frame( asm.mov(cfp_opnd(RUBY_OFFSET_CFP_SELF), frame.recv); asm.mov(cfp_opnd(RUBY_OFFSET_CFP_BLOCK_CODE), 0.into()); - if frame.iseq.is_some() { - // Spill stack temps to let the callee use them (must be done before changing the SP register) - asm.spill_regs(); - - // Saving SP before calculating ep avoids a dependency on a register - // However this must be done after referencing frame.recv, which may be SP-relative - asm.mov(SP, sp); - } let ep = asm.sub(sp, SIZEOF_VALUE.into()); asm.mov(cfp_opnd(RUBY_OFFSET_CFP_EP), ep); } @@ -7770,9 +7791,32 @@ fn gen_send_iseq( pc: None, // We are calling into jitted code, which will set the PC as necessary })); + // Create a context for the callee + let mut callee_ctx = Context::default(); + + // Transfer some stack temp registers to the callee's locals for arguments. + let mapped_temps = if !forwarding { + asm.map_temp_regs_to_args(&mut callee_ctx, argc) + } else { + // When forwarding, the callee's local table has only a callinfo, + // so we can't map the actual arguments to the callee's locals. + vec![] + }; + + // Spill stack temps and locals that are not used by the callee. + // This must be done before changing the SP register. + asm.spill_regs_except(&mapped_temps); + + // Saving SP before calculating ep avoids a dependency on a register + // However this must be done after referencing frame.recv, which may be SP-relative + asm.mov(SP, callee_sp); + // Log the name of the method we're calling to. We intentionally don't do this for inlined ISEQs. // We also do this after gen_push_frame() to minimize the impact of spill_temps() on asm.ccall(). if get_option!(gen_stats) { + // Protect caller-saved registers in case they're used for arguments + asm.cpush_all(); + // Assemble the ISEQ name string let name_str = get_iseq_name(iseq); @@ -7781,6 +7825,7 @@ fn gen_send_iseq( // Increment the counter for this cfunc asm.ccall(incr_iseq_counter as *const u8, vec![iseq_idx.into()]); + asm.cpop_all(); } // No need to set cfp->pc since the callee sets it whenever calling into routines @@ -7794,9 +7839,6 @@ fn gen_send_iseq( idx: jit.next_insn_idx(), }; - // Create a context for the callee - let mut callee_ctx = Context::default(); - // If the callee has :inline_block annotation and the callsite has a block ISEQ, // duplicate a callee block for each block ISEQ to make its `yield` monomorphic. if let (Some(BlockHandler::BlockISeq(iseq)), true) = (block, builtin_attrs & BUILTIN_ATTR_INLINE_BLOCK != 0) { @@ -7816,6 +7858,7 @@ fn gen_send_iseq( callee_ctx.set_local_type(0, Type::Unknown) } + // Set the receiver type in the callee's context let recv_type = if captured_self { Type::Unknown // we don't track the type information of captured->self for now } else { @@ -7823,6 +7866,29 @@ fn gen_send_iseq( }; callee_ctx.upgrade_opnd_type(SelfOpnd, recv_type); + // Now that callee_ctx is prepared, discover a block that can be reused if we move some registers. + // If there's such a block, move registers accordingly to avoid creating a new block. 
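+ // For example, if an existing version keeps Local(0) in a different register
+ // than callee_ctx assigns, a register-to-register shuffle lets us jump to that
+ // version instead of compiling a new one.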
+ let blockid = BlockId { iseq, idx: start_pc_offset }; + if !mapped_temps.is_empty() { + // Discover a block that have the same things in different (or same) registers + if let Some(block_ctx) = find_block_ctx_with_same_regs(blockid, &callee_ctx) { + // List pairs of moves for making the register mappings compatible + let mut moves = vec![]; + for ®_opnd in callee_ctx.get_reg_mapping().get_reg_opnds().iter() { + let old_reg = TEMP_REGS[callee_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; + let new_reg = TEMP_REGS[block_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; + moves.push((new_reg, Opnd::Reg(old_reg))); + } + + // Shuffle them to break cycles and generate the moves + let moves = Assembler::reorder_reg_moves(&moves); + for (reg, opnd) in moves { + asm.load_into(Opnd::Reg(reg), opnd); + } + callee_ctx.set_reg_mapping(block_ctx.get_reg_mapping()); + } + } + // The callee might change locals through Kernel#binding and other means. asm.clear_local_types(); @@ -7856,10 +7922,7 @@ fn gen_send_iseq( gen_direct_jump( jit, &callee_ctx, - BlockId { - iseq: iseq, - idx: start_pc_offset, - }, + blockid, asm, ); @@ -8541,7 +8604,7 @@ fn gen_send_general( let mut flags = unsafe { vm_ci_flag(ci) }; // Defer compilation so we can specialize on class of receiver - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -9102,7 +9165,7 @@ fn gen_invokeblock_specialized( asm: &mut Assembler, cd: *const rb_call_data, ) -> Option { - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -9265,7 +9328,7 @@ fn gen_invokesuper_specialized( cd: *const rb_call_data, ) -> Option { // Defer compilation so we can specialize on class of receiver - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -9499,7 +9562,7 @@ fn gen_objtostring( jit: &mut JITState, asm: &mut Assembler, ) -> Option { - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -9842,7 +9905,7 @@ fn gen_getblockparamproxy( jit: &mut JITState, asm: &mut Assembler, ) -> Option { - if !jit.at_current_insn() { + if !jit.at_compile_target() { defer_compilation(jit, asm); return Some(EndBlock); } @@ -10593,6 +10656,7 @@ mod tests { cb.get_write_ptr(), ptr::null(), // No execution context in tests. No peeking! ocb, + true, ) } diff --git a/yjit/src/core.rs b/yjit/src/core.rs index 4d2425aeb2..1f96dd453f 100644 --- a/yjit/src/core.rs +++ b/yjit/src/core.rs @@ -371,6 +371,12 @@ impl RegMapping { .map(|(reg_idx, _)| reg_idx) } + /// Set a given operand to the register at a given index. + pub fn set_reg(&mut self, opnd: RegOpnd, reg_idx: usize) { + assert!(self.0[reg_idx].is_none()); + self.0[reg_idx] = Some(opnd); + } + /// Allocate a register for a given operand if available. /// Return true if self is updated. pub fn alloc_reg(&mut self, opnd: RegOpnd) -> bool { @@ -435,6 +441,32 @@ impl RegMapping { RegOpnd::Local(_) => index_temps.rev().find(|(_, reg_opnd)| reg_opnd.is_none()), }.map(|(index, _)| index) } + + /// Return a vector of RegOpnds that have an allocated register + pub fn get_reg_opnds(&self) -> Vec { + self.0.iter().filter_map(|®_opnd| reg_opnd).collect() + } + + /// Return TypeDiff::Compatible(diff) if dst has a mapping that can be made by moving registers + /// in self `diff` times. TypeDiff::Incompatible if they have different things in registers. 
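+ /// For example, if `self` keeps Local(0) in register 0 while `dst` keeps it in
+ /// register 1, the result is Compatible(1); if the two mappings hold different
+ /// operands, the result is Incompatible.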
+ pub fn diff(&self, dst: RegMapping) -> TypeDiff { + let src_opnds = self.get_reg_opnds(); + let dst_opnds = dst.get_reg_opnds(); + if src_opnds.len() != dst_opnds.len() { + return TypeDiff::Incompatible; + } + + let mut diff = 0; + for ®_opnd in src_opnds.iter() { + match (self.get_reg(reg_opnd), dst.get_reg(reg_opnd)) { + (Some(src_idx), Some(dst_idx)) => if src_idx != dst_idx { + diff += 1; + } + _ => return TypeDiff::Incompatible, + } + } + TypeDiff::Compatible(diff) + } } impl fmt::Debug for RegMapping { @@ -2080,9 +2112,8 @@ pub fn take_version_list(blockid: BlockId) -> VersionList { } } -/// Count the number of block versions matching a given blockid -/// `inlined: true` counts inlined versions, and `inlined: false` counts other versions. -fn get_num_versions(blockid: BlockId, inlined: bool) -> usize { +/// Count the number of block versions that match a given BlockId and part of a Context +fn get_num_versions(blockid: BlockId, ctx: &Context) -> usize { let insn_idx = blockid.idx.as_usize(); match get_iseq_payload(blockid.iseq) { @@ -2094,9 +2125,14 @@ fn get_num_versions(blockid: BlockId, inlined: bool) -> usize { .version_map .get(insn_idx) .map(|versions| { - versions.iter().filter(|&&version| - Context::decode(unsafe { version.as_ref() }.ctx).inline() == inlined - ).count() + versions.iter().filter(|&&version| { + let version_ctx = Context::decode(unsafe { version.as_ref() }.ctx); + // Inline versions are counted separately towards MAX_INLINE_VERSIONS. + version_ctx.inline() == ctx.inline() && + // find_block_versions() finds only blocks with compatible reg_mapping, + // so count only versions with compatible reg_mapping. + version_ctx.reg_mapping == ctx.reg_mapping + }).count() }) .unwrap_or(0) } @@ -2128,10 +2164,7 @@ pub fn get_or_create_iseq_block_list(iseq: IseqPtr) -> Vec { /// Retrieve a basic block version for an (iseq, idx) tuple /// This will return None if no version is found fn find_block_version(blockid: BlockId, ctx: &Context) -> Option { - let versions = match get_version_list(blockid) { - Some(versions) => versions, - None => return None, - }; + let versions = get_version_list(blockid)?; // Best match found let mut best_version: Option = None; @@ -2156,6 +2189,33 @@ fn find_block_version(blockid: BlockId, ctx: &Context) -> Option { return best_version; } +/// Basically find_block_version() but allows RegMapping incompatibility +/// that can be fixed by register moves and returns Context +pub fn find_block_ctx_with_same_regs(blockid: BlockId, ctx: &Context) -> Option { + let versions = get_version_list(blockid)?; + + // Best match found + let mut best_ctx: Option = None; + let mut best_diff = usize::MAX; + + // For each version matching the blockid + for blockref in versions.iter() { + let block = unsafe { blockref.as_ref() }; + let block_ctx = Context::decode(block.ctx); + + // Discover the best block that is compatible if we move registers + match ctx.diff_with_same_regs(&block_ctx) { + TypeDiff::Compatible(diff) if diff < best_diff => { + best_ctx = Some(block_ctx); + best_diff = diff; + } + _ => {} + } + } + + best_ctx +} + /// Allow inlining a Block up to MAX_INLINE_VERSIONS times. 
const MAX_INLINE_VERSIONS: usize = 1000; @@ -2166,7 +2226,7 @@ pub fn limit_block_versions(blockid: BlockId, ctx: &Context) -> Context { return *ctx; } - let next_versions = get_num_versions(blockid, ctx.inline()) + 1; + let next_versions = get_num_versions(blockid, ctx) + 1; let max_versions = if ctx.inline() { MAX_INLINE_VERSIONS } else { @@ -2782,8 +2842,24 @@ impl Context { return TypeDiff::Compatible(diff); } + /// Basically diff() but allows RegMapping incompatibility that can be fixed + /// by register moves. + pub fn diff_with_same_regs(&self, dst: &Context) -> TypeDiff { + // Prepare a Context with the same registers + let mut dst_with_same_regs = dst.clone(); + dst_with_same_regs.set_reg_mapping(self.get_reg_mapping()); + + // Diff registers and other stuff separately, and merge them + match (self.diff(&dst_with_same_regs), self.get_reg_mapping().diff(dst.get_reg_mapping())) { + (TypeDiff::Compatible(ctx_diff), TypeDiff::Compatible(reg_diff)) => { + TypeDiff::Compatible(ctx_diff + reg_diff) + } + _ => TypeDiff::Incompatible + } + } + pub fn two_fixnums_on_stack(&self, jit: &mut JITState) -> Option { - if jit.at_current_insn() { + if jit.at_compile_target() { let comptime_recv = jit.peek_at_stack(self, 1); let comptime_arg = jit.peek_at_stack(self, 0); return Some(comptime_recv.fixnum_p() && comptime_arg.fixnum_p()); @@ -2955,7 +3031,7 @@ fn gen_block_series_body( let mut batch = Vec::with_capacity(EXPECTED_BATCH_SIZE); // Generate code for the first block - let first_block = gen_single_block(blockid, start_ctx, ec, cb, ocb).ok()?; + let first_block = gen_single_block(blockid, start_ctx, ec, cb, ocb, true).ok()?; batch.push(first_block); // Keep track of this block version // Add the block version to the VersionMap for this ISEQ @@ -2996,7 +3072,7 @@ fn gen_block_series_body( // Generate new block using context from the last branch. let requested_ctx = Context::decode(requested_ctx); - let result = gen_single_block(requested_blockid, &requested_ctx, ec, cb, ocb); + let result = gen_single_block(requested_blockid, &requested_ctx, ec, cb, ocb, false); // If the block failed to compile if result.is_err() { @@ -4312,7 +4388,7 @@ mod tests { let cb = CodeBlock::new_dummy(1024); let mut ocb = OutlinedCb::wrap(CodeBlock::new_dummy(1024)); let dumm_addr = cb.get_write_ptr(); - let block = JITState::new(blockid, Context::default(), dumm_addr, ptr::null(), &mut ocb) + let block = JITState::new(blockid, Context::default(), dumm_addr, ptr::null(), &mut ocb, true) .into_block(0, dumm_addr, dumm_addr, vec![]); let _dropper = BlockDropper(block);
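
To see the core idea of the patch in isolation: the caller's argument temps sit at the top of its stack, and whatever registers those temps occupy are handed to the callee as its first locals, so no memory traffic is needed to pass the arguments. The following is a minimal, standalone Rust sketch of that bookkeeping. It uses simplified stand-in types (a two-register RegMapping, a toy RegOpnd enum, and a hypothetical map_args helper modeled on the patch's map_temp_regs_to_args); it is an illustration under those assumptions, not YJIT's actual yjit/src/core.rs code.

// Simplified stand-ins for YJIT's RegOpnd/RegMapping, assuming two temp registers.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegOpnd {
    Stack(u8), // caller stack temp
    Local(u8), // callee local variable
}

#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)]
struct RegMapping([Option<RegOpnd>; 2]);

impl RegMapping {
    // Return the register index currently holding `opnd`, if any.
    fn get_reg(&self, opnd: RegOpnd) -> Option<usize> {
        self.0.iter().position(|&slot| slot == Some(opnd))
    }

    // Assign `opnd` to a specific register index, which must be free.
    fn set_reg(&mut self, opnd: RegOpnd, reg_idx: usize) {
        assert!(self.0[reg_idx].is_none());
        self.0[reg_idx] = Some(opnd);
    }
}

// Hypothetical helper mirroring map_temp_regs_to_args: each argument temp that
// already lives in a register hands that register to the corresponding callee local.
fn map_args(caller: &RegMapping, callee: &mut RegMapping, stack_size: u8, argc: u8) -> Vec<RegOpnd> {
    let mut mapped_temps = vec![];
    for arg_idx in 0..argc {
        let temp_opnd = RegOpnd::Stack(stack_size - argc + arg_idx);
        if let Some(reg_idx) = caller.get_reg(temp_opnd) {
            callee.set_reg(RegOpnd::Local(arg_idx), reg_idx);
            mapped_temps.push(temp_opnd);
        }
    }
    mapped_temps
}

fn main() {
    // Caller: the two arguments are the top two stack temps, held in registers 0 and 1.
    let mut caller = RegMapping::default();
    caller.set_reg(RegOpnd::Stack(0), 0);
    caller.set_reg(RegOpnd::Stack(1), 1);

    // Callee: its first two locals inherit those registers.
    let mut callee = RegMapping::default();
    let mapped_temps = map_args(&caller, &mut callee, 2, 2);

    println!("mapped temps:   {:?}", mapped_temps); // [Stack(0), Stack(1)]
    println!("callee mapping: {:?}", callee);       // Local(0) in reg 0, Local(1) in reg 1
}

The temps returned by the helper correspond to the `mapped_temps` that gen_send_iseq passes to spill_regs_except: those registers are deliberately not spilled before the call, because the callee will read the arguments straight out of them.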