From 888ba29e462075472776098f4f95eb6d3df8e730 Mon Sep 17 00:00:00 2001
From: Takashi Kokubun
Date: Mon, 12 Jun 2023 09:54:06 -0700
Subject: [PATCH] YJIT: Break register cycles for C arguments (#7918)

---
 yjit/src/backend/arm64/mod.rs  |  15 ++--
 yjit/src/backend/ir.rs         |  97 ++++++++++++++++++++++++--
 yjit/src/backend/x86_64/mod.rs | 122 +++++++++++++++++++++++++++++++--
 3 files changed, 220 insertions(+), 14 deletions(-)

diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index d3a386d42e..9787b1a4e9 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -74,6 +74,7 @@ impl From<Opnd> for A64Opnd {
             Opnd::Mem(Mem { base: MemBase::InsnOut(_), .. }) => {
                 panic!("attempted to lower an Opnd::Mem with a MemBase::InsnOut base")
             },
+            Opnd::CArg(_) => panic!("attempted to lower an Opnd::CArg"),
             Opnd::InsnOut { .. } => panic!("attempted to lower an Opnd::InsnOut"),
             Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"),
             Opnd::Stack { .. } => panic!("attempted to lower an Opnd::Stack"),
@@ -185,9 +186,10 @@ fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize {
 
 impl Assembler
 {
-    // A special scratch register for intermediate processing.
+    // Special scratch registers for intermediate processing.
     // This register is caller-saved (so we don't have to save it before using it)
-    const SCRATCH0: A64Opnd = A64Opnd::Reg(X16_REG);
+    pub const SCRATCH_REG: Reg = X16_REG;
+    const SCRATCH0: A64Opnd = A64Opnd::Reg(Assembler::SCRATCH_REG);
     const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG);
 
     /// List of registers that can be used for stack temps.
@@ -280,7 +282,7 @@ impl Assembler
     /// do follow that encoding, and if they don't then we load them first.
     fn split_bitmask_immediate(asm: &mut Assembler, opnd: Opnd, dest_num_bits: u8) -> Opnd {
         match opnd {
-            Opnd::Reg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd,
+            Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd,
             Opnd::Mem(_) => split_load_operand(asm, opnd),
             Opnd::Imm(imm) => {
                 if imm == 0 {
@@ -313,7 +315,7 @@ impl Assembler
     /// a certain size. If they don't then we need to load them first.
     fn split_shifted_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
         match opnd {
-            Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd,
+            Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } => opnd,
             Opnd::Mem(_) => split_load_operand(asm, opnd),
             Opnd::Imm(_) => asm.load(opnd),
             Opnd::UImm(uimm) => {
@@ -452,7 +454,7 @@ impl Assembler
                         _ => *opnd
                     };
 
-                    asm.load_into(C_ARG_OPNDS[idx], value);
+                    asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), value);
                 }
 
                 // Now we push the CCall without any arguments so that it
@@ -924,6 +926,9 @@ impl Assembler
                     let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
                     insn_gc_offsets.push(ptr_offset);
                 },
+                Opnd::CArg { .. } => {
+                    unreachable!("C argument operand was not lowered before arm64_emit");
+                }
                 Opnd::Stack { .. } => {
                     unreachable!("Stack operand was not lowered before arm64_emit");
                 }
diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs
index 8c4a6c1b6d..d09bf892ab 100644
--- a/yjit/src/backend/ir.rs
+++ b/yjit/src/backend/ir.rs
@@ -72,6 +72,9 @@ pub enum Opnd
     // Immediate Ruby value, may be GC'd, movable
     Value(VALUE),
 
+    /// C argument register. The alloc_regs resolves its register dependencies.
+    CArg(Reg),
+
     // Output of a preceding instruction in this block
     InsnOut{ idx: usize, num_bits: u8 },
 
@@ -102,6 +105,7 @@ impl fmt::Debug for Opnd {
         match self {
             Self::None => write!(fmt, "None"),
             Value(val) => write!(fmt, "Value({val:?})"),
+            CArg(reg) => write!(fmt, "CArg({reg:?})"),
             Stack { idx, sp_offset, .. } => write!(fmt, "SP[{}]", *sp_offset as i32 - idx - 1),
             InsnOut { idx, num_bits } => write!(fmt, "Out{num_bits}({idx})"),
             Imm(signed) => write!(fmt, "{signed:x}_i64"),
@@ -145,6 +149,14 @@ impl Opnd
         Opnd::UImm(ptr as u64)
     }
 
+    /// Constructor for a C argument operand
+    pub fn c_arg(reg_opnd: Opnd) -> Self {
+        match reg_opnd {
+            Opnd::Reg(reg) => Opnd::CArg(reg),
+            _ => unreachable!(),
+        }
+    }
+
     pub fn is_some(&self) -> bool {
         match *self {
             Opnd::None => false,
@@ -1224,6 +1236,55 @@ impl Assembler
         }
     }
 
+    // Reorder C argument moves, sometimes adding extra moves using SCRATCH_REG,
+    // so that they will not rewrite each other before they are used.
+    fn reorder_c_args(c_args: &Vec<(Reg, Opnd)>) -> Vec<(Reg, Opnd)> {
+        // Return the index of a move whose destination is not used as a source if any.
+        fn find_safe_arg(c_args: &Vec<(Reg, Opnd)>) -> Option<usize> {
+            c_args.iter().enumerate().find(|(_, &(dest_reg, _))| {
+                c_args.iter().all(|&(_, src_opnd)| src_opnd != Opnd::Reg(dest_reg))
+            }).map(|(index, _)| index)
+        }
+
+        // Remove moves whose source and destination are the same
+        let mut c_args: Vec<(Reg, Opnd)> = c_args.clone().into_iter()
+            .filter(|&(reg, opnd)| Opnd::Reg(reg) != opnd).collect();
+
+        let mut moves = vec![];
+        while c_args.len() > 0 {
+            // Keep taking safe moves
+            while let Some(index) = find_safe_arg(&c_args) {
+                moves.push(c_args.remove(index));
+            }
+
+            // No safe move. Load the source of one move into SCRATCH_REG, and
+            // then load SCRATCH_REG into the destination when it's safe.
+            if c_args.len() > 0 {
+                // Make sure it's safe to use SCRATCH_REG
+                assert!(c_args.iter().all(|&(_, opnd)| opnd != Opnd::Reg(Assembler::SCRATCH_REG)));
+
+                // Move SCRATCH <- opnd, and delay reg <- SCRATCH
+                let (reg, opnd) = c_args.remove(0);
+                moves.push((Assembler::SCRATCH_REG, opnd));
+                c_args.push((reg, Opnd::Reg(Assembler::SCRATCH_REG)));
+            }
+        }
+        moves
+    }
+
+    // Adjust the number of entries in live_ranges so that it can be indexed by mapped indexes.
+    fn shift_live_ranges(live_ranges: &mut Vec<usize>, start_index: usize, shift_offset: isize) {
+        if shift_offset >= 0 {
+            for index in 0..(shift_offset as usize) {
+                live_ranges.insert(start_index + index, start_index + index);
+            }
+        } else {
+            for _ in 0..-shift_offset {
+                live_ranges.remove(start_index);
+            }
+        }
+    }
+
     // Dump live registers for register spill debugging.
     fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<usize>, num_regs: usize, spill_index: usize) {
         // Convert live_ranges to live_regs: the number of live registers at each index
@@ -1247,11 +1308,18 @@ impl Assembler
             }
         }
 
+        // We may need to reorder LoadInto instructions with a C argument operand.
+        // This buffers the operands of such instructions to process them in batches.
+        let mut c_args: Vec<(Reg, Opnd)> = vec![];
+
+        // live_ranges is indexed by original `index` given by the iterator.
         let live_ranges: Vec<usize> = take(&mut self.live_ranges);
+        // shifted_live_ranges is indexed by mapped indexes in insn operands.
+        let mut shifted_live_ranges: Vec<usize> = live_ranges.clone();
         let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), take(&mut self.side_exits));
         let mut iterator = self.into_draining_iter();
 
-        while let Some((index, mut insn)) = iterator.next_unmapped() {
+        while let Some((index, mut insn)) = iterator.next_mapped() {
             // Check if this is the last instruction that uses an operand that
             // spans more than one instruction. In that case, return the
             // allocated register to the pool.
@@ -1262,12 +1330,11 @@ impl Assembler
                         // Since we have an InsnOut, we know it spans more that one
                         // instruction.
                         let start_index = *idx;
-                        assert!(start_index < index);
 
                         // We're going to check if this is the last instruction that
                         // uses this operand. If it is, we can return the allocated
                         // register to the pool.
-                        if live_ranges[start_index] == index {
+                        if shifted_live_ranges[start_index] == index {
                             if let Some(Opnd::Reg(reg)) = asm.insns[start_index].out_opnd() {
                                 dealloc_reg(&mut pool, &regs, reg);
                             } else {
@@ -1371,7 +1438,27 @@ impl Assembler
                 }
             }
 
-            asm.push_insn(insn);
+            // Push instruction(s). Batch and reorder C argument operations if needed.
+            if let Insn::LoadInto { dest: Opnd::CArg(reg), opnd } = insn {
+                // Buffer C arguments
+                c_args.push((reg, opnd));
+            } else {
+                // C arguments are buffered until CCall
+                if c_args.len() > 0 {
+                    // Resolve C argument dependencies
+                    let c_args_len = c_args.len() as isize;
+                    let moves = reorder_c_args(&c_args.drain(..).into_iter().collect());
+                    shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len);
+
+                    // Push batched C arguments
+                    for (reg, opnd) in moves {
+                        asm.load_into(Opnd::Reg(reg), opnd);
+                    }
+                }
+                // Other instructions are pushed as is
+                asm.push_insn(insn);
+            }
+            iterator.map_insn_index(&mut asm);
         }
 
         assert_eq!(pool, 0, "Expected all registers to be returned to the pool");
@@ -1442,7 +1529,7 @@ impl AssemblerDrainingIterator {
     /// end of the current list of instructions in order to maintain that
     /// alignment.
     pub fn map_insn_index(&mut self, asm: &mut Assembler) {
-        self.indices.push(asm.insns.len() - 1);
+        self.indices.push(asm.insns.len().saturating_sub(1));
     }
 
     /// Map an operand by using this iterator's list of mapped indices.
diff --git a/yjit/src/backend/x86_64/mod.rs b/yjit/src/backend/x86_64/mod.rs
index a2ee94cf66..b40d8b2382 100644
--- a/yjit/src/backend/x86_64/mod.rs
+++ b/yjit/src/backend/x86_64/mod.rs
@@ -87,9 +87,9 @@ impl From<&Opnd> for X86Opnd {
 impl Assembler
 {
     // A special scratch register for intermediate processing.
-    // Note: right now this is only used by LeaLabel because label_ref accepts
-    // a closure and we don't want it to have to capture anything.
-    const SCRATCH0: X86Opnd = X86Opnd::Reg(R11_REG);
+    // This register is caller-saved (so we don't have to save it before using it)
+    pub const SCRATCH_REG: Reg = R11_REG;
+    const SCRATCH0: X86Opnd = X86Opnd::Reg(Assembler::SCRATCH_REG);
 
     /// List of registers that can be used for stack temps.
     pub const TEMP_REGS: [Reg; 5] = [RSI_REG, RDI_REG, R8_REG, R9_REG, R10_REG];
@@ -347,7 +347,7 @@ impl Assembler
                 // Load each operand into the corresponding argument
                 // register.
                 for (idx, opnd) in opnds.into_iter().enumerate() {
-                    asm.load_into(C_ARG_OPNDS[idx], *opnd);
+                    asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), *opnd);
                 }
 
                 // Now we push the CCall without any arguments so that it
@@ -1055,4 +1055,118 @@ mod tests {
 
         assert_eq!(format!("{:x}", cb), "4983f540");
     }
+
+    #[test]
+    fn test_reorder_c_args_no_cycle() {
+        let (mut asm, mut cb) = setup_asm();
+
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[0], // mov rdi, rdi (optimized away)
+            C_ARG_OPNDS[1], // mov rsi, rsi (optimized away)
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "b800000000ffd0", {"
+            0x0: mov eax, 0
+            0x5: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_single_cycle() {
+        let (mut asm, mut cb) = setup_asm();
+
+        // rdi and rsi form a cycle
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[1], // mov rdi, rsi
+            C_ARG_OPNDS[0], // mov rsi, rdi
+            C_ARG_OPNDS[2], // mov rdx, rdx (optimized away)
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "4989f34889fe4c89dfb800000000ffd0", {"
+            0x0: mov r11, rsi
+            0x3: mov rsi, rdi
+            0x6: mov rdi, r11
+            0x9: mov eax, 0
+            0xe: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_two_cycles() {
+        let (mut asm, mut cb) = setup_asm();
+
+        // rdi and rsi form a cycle, and rdx and rcx form another cycle
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[1], // mov rdi, rsi
+            C_ARG_OPNDS[0], // mov rsi, rdi
+            C_ARG_OPNDS[3], // mov rdx, rcx
+            C_ARG_OPNDS[2], // mov rcx, rdx
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "4989f34889fe4c89df4989cb4889d14c89dab800000000ffd0", {"
+            0x0: mov r11, rsi
+            0x3: mov rsi, rdi
+            0x6: mov rdi, r11
+            0x9: mov r11, rcx
+            0xc: mov rcx, rdx
+            0xf: mov rdx, r11
+            0x12: mov eax, 0
+            0x17: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_large_cycle() {
+        let (mut asm, mut cb) = setup_asm();
+
+        // rdi, rsi, and rdx form a cycle
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[1], // mov rdi, rsi
+            C_ARG_OPNDS[2], // mov rsi, rdx
+            C_ARG_OPNDS[0], // mov rdx, rdi
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "4989f34889d64889fa4c89dfb800000000ffd0", {"
+            0x0: mov r11, rsi
+            0x3: mov rsi, rdx
+            0x6: mov rdx, rdi
+            0x9: mov rdi, r11
+            0xc: mov eax, 0
+            0x11: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_with_insn_out() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let rax = asm.load(Opnd::UImm(1));
+        let rcx = asm.load(Opnd::UImm(2));
+        let rdx = asm.load(Opnd::UImm(3));
+        // rcx and rdx form a cycle
+        asm.ccall(0 as _, vec![
+            rax, // mov rdi, rax
+            rcx, // mov rsi, rcx
+            rcx, // mov rdx, rcx
+            rdx, // mov rcx, rdx
+        ]);
+        asm.compile_with_num_regs(&mut cb, 3);
+
+        assert_disasm!(cb, "b801000000b902000000ba030000004889c74889ce4989cb4889d14c89dab800000000ffd0", {"
+            0x0: mov eax, 1
+            0x5: mov ecx, 2
+            0xa: mov edx, 3
+            0xf: mov rdi, rax
+            0x12: mov rsi, rcx
+            0x15: mov r11, rcx
+            0x18: mov rcx, rdx
+            0x1b: mov rdx, r11
+            0x1e: mov eax, 0
+            0x23: call rax
+        "});
+    }
 }
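
The cycle-breaking done by reorder_c_args above is a standard parallel-move resolution: emit any move whose destination is no longer read by a pending move, and when only cycles remain, route one value through the scratch register. Because each C argument register is written at most once, a stuck state is always a union of disjoint cycles, so a single scratch register (R11 on x86_64, X16 on arm64) is enough to make progress. The sketch below is a minimal, standalone model of the same idea outside YJIT's IR; the Move struct, the serialize_moves helper, and the plain register-name strings are illustrative assumptions, not YJIT code.

// Standalone model of the parallel-move resolution above (illustrative, not YJIT code).
// Each Move copies `src` into `dst`; all moves are meant to happen "at once".
#[derive(Clone, Debug)]
struct Move {
    dst: &'static str,
    src: &'static str,
}

// Serialize parallel moves so that no destination is overwritten before it is read,
// breaking cycles by routing one value through a dedicated scratch register.
fn serialize_moves(mut pending: Vec<Move>, scratch: &'static str) -> Vec<Move> {
    // Drop no-op moves (dst == src), like the filter at the top of reorder_c_args.
    pending.retain(|m| m.dst != m.src);

    let mut out = Vec::new();
    while !pending.is_empty() {
        // A move is safe when no pending move still reads its destination.
        if let Some(i) = (0..pending.len())
            .find(|&i| pending.iter().all(|m| m.src != pending[i].dst))
        {
            out.push(pending.remove(i));
        } else {
            // Only cycles remain: park one source in the scratch register now and
            // finish the interrupted move from the scratch register later.
            assert!(pending.iter().all(|m| m.src != scratch));
            let Move { dst, src } = pending.remove(0);
            out.push(Move { dst: scratch, src });
            pending.push(Move { dst, src: scratch });
        }
    }
    out
}

fn main() {
    // The rdi/rsi swap from test_reorder_c_args_single_cycle (rdx -> rdx is a no-op).
    let args = vec![
        Move { dst: "rdi", src: "rsi" },
        Move { dst: "rsi", src: "rdi" },
        Move { dst: "rdx", src: "rdx" },
    ];
    for m in serialize_moves(args, "r11") {
        println!("mov {}, {}", m.dst, m.src);
    }
    // => mov r11, rsi / mov rsi, rdi / mov rdi, r11
}

Running this on the rdi/rsi swap produces the same sequence the single-cycle test expects: mov r11, rsi; mov rsi, rdi; mov rdi, r11.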