From 888ba29e462075472776098f4f95eb6d3df8e730 Mon Sep 17 00:00:00 2001
From: Takashi Kokubun
Date: Mon, 12 Jun 2023 09:54:06 -0700
Subject: [PATCH] YJIT: Break register cycles for C arguments (#7918)

---
 yjit/src/backend/arm64/mod.rs  |  15 ++--
 yjit/src/backend/ir.rs         |  97 ++++++++++++++++++++++++--
 yjit/src/backend/x86_64/mod.rs | 122 +++++++++++++++++++++++++++++++--
 3 files changed, 220 insertions(+), 14 deletions(-)

diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index d3a386d42e..9787b1a4e9 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -74,6 +74,7 @@ impl From<Opnd> for A64Opnd {
             Opnd::Mem(Mem { base: MemBase::InsnOut(_), .. }) => {
                 panic!("attempted to lower an Opnd::Mem with a MemBase::InsnOut base")
             },
+            Opnd::CArg(_) => panic!("attempted to lower an Opnd::CArg"),
             Opnd::InsnOut { .. } => panic!("attempted to lower an Opnd::InsnOut"),
             Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"),
             Opnd::Stack { .. } => panic!("attempted to lower an Opnd::Stack"),
@@ -185,9 +186,10 @@ fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize {
 
 impl Assembler
 {
-    // A special scratch register for intermediate processing.
+    // Special scratch registers for intermediate processing.
     // This register is caller-saved (so we don't have to save it before using it)
-    const SCRATCH0: A64Opnd = A64Opnd::Reg(X16_REG);
+    pub const SCRATCH_REG: Reg = X16_REG;
+    const SCRATCH0: A64Opnd = A64Opnd::Reg(Assembler::SCRATCH_REG);
     const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG);
 
     /// List of registers that can be used for stack temps.
@@ -280,7 +282,7 @@ impl Assembler
     /// do follow that encoding, and if they don't then we load them first.
     fn split_bitmask_immediate(asm: &mut Assembler, opnd: Opnd, dest_num_bits: u8) -> Opnd {
         match opnd {
-            Opnd::Reg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd,
+            Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd,
             Opnd::Mem(_) => split_load_operand(asm, opnd),
             Opnd::Imm(imm) => {
                 if imm == 0 {
@@ -313,7 +315,7 @@ impl Assembler
     /// a certain size. If they don't then we need to load them first.
     fn split_shifted_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
         match opnd {
-            Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd,
+            Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } => opnd,
             Opnd::Mem(_) => split_load_operand(asm, opnd),
             Opnd::Imm(_) => asm.load(opnd),
             Opnd::UImm(uimm) => {
@@ -452,7 +454,7 @@ impl Assembler
                         _ => *opnd
                     };
 
-                    asm.load_into(C_ARG_OPNDS[idx], value);
+                    asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), value);
                 }
 
                 // Now we push the CCall without any arguments so that it
@@ -924,6 +926,9 @@ impl Assembler
                     let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
                     insn_gc_offsets.push(ptr_offset);
                 },
+                Opnd::CArg { .. } => {
+                    unreachable!("C argument operand was not lowered before arm64_emit");
+                }
                 Opnd::Stack { .. } => {
                     unreachable!("Stack operand was not lowered before arm64_emit");
                 }
diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs
index 8c4a6c1b6d..d09bf892ab 100644
--- a/yjit/src/backend/ir.rs
+++ b/yjit/src/backend/ir.rs
@@ -72,6 +72,9 @@ pub enum Opnd
     // Immediate Ruby value, may be GC'd, movable
     Value(VALUE),
 
+    /// C argument register. The alloc_regs resolves its register dependencies.
+    CArg(Reg),
+
     // Output of a preceding instruction in this block
     InsnOut{ idx: usize, num_bits: u8 },
 
@@ -102,6 +105,7 @@ impl fmt::Debug for Opnd {
         match self {
             Self::None => write!(fmt, "None"),
             Value(val) => write!(fmt, "Value({val:?})"),
+            CArg(reg) => write!(fmt, "CArg({reg:?})"),
             Stack { idx, sp_offset, .. } => write!(fmt, "SP[{}]", *sp_offset as i32 - idx - 1),
             InsnOut { idx, num_bits } => write!(fmt, "Out{num_bits}({idx})"),
             Imm(signed) => write!(fmt, "{signed:x}_i64"),
@@ -145,6 +149,14 @@ impl Opnd
         Opnd::UImm(ptr as u64)
     }
 
+    /// Constructor for a C argument operand
+    pub fn c_arg(reg_opnd: Opnd) -> Self {
+        match reg_opnd {
+            Opnd::Reg(reg) => Opnd::CArg(reg),
+            _ => unreachable!(),
+        }
+    }
+
     pub fn is_some(&self) -> bool {
         match *self {
             Opnd::None => false,
@@ -1224,6 +1236,55 @@ impl Assembler
         }
     }
 
+    // Reorder C argument moves, sometimes adding extra moves using SCRATCH_REG,
+    // so that they will not rewrite each other before they are used.
+    fn reorder_c_args(c_args: &Vec<(Reg, Opnd)>) -> Vec<(Reg, Opnd)> {
+        // Return the index of a move whose destination is not used as a source if any.
+        fn find_safe_arg(c_args: &Vec<(Reg, Opnd)>) -> Option<usize> {
+            c_args.iter().enumerate().find(|(_, &(dest_reg, _))| {
+                c_args.iter().all(|&(_, src_opnd)| src_opnd != Opnd::Reg(dest_reg))
+            }).map(|(index, _)| index)
+        }
+
+        // Remove moves whose source and destination are the same
+        let mut c_args: Vec<(Reg, Opnd)> = c_args.clone().into_iter()
+            .filter(|&(reg, opnd)| Opnd::Reg(reg) != opnd).collect();
+
+        let mut moves = vec![];
+        while c_args.len() > 0 {
+            // Keep taking safe moves
+            while let Some(index) = find_safe_arg(&c_args) {
+                moves.push(c_args.remove(index));
+            }
+
+            // No safe move. Load the source of one move into SCRATCH_REG, and
+            // then load SCRATCH_REG into the destination when it's safe.
+            if c_args.len() > 0 {
+                // Make sure it's safe to use SCRATCH_REG
+                assert!(c_args.iter().all(|&(_, opnd)| opnd != Opnd::Reg(Assembler::SCRATCH_REG)));
+
+                // Move SCRATCH <- opnd, and delay reg <- SCRATCH
+                let (reg, opnd) = c_args.remove(0);
+                moves.push((Assembler::SCRATCH_REG, opnd));
+                c_args.push((reg, Opnd::Reg(Assembler::SCRATCH_REG)));
+            }
+        }
+        moves
+    }
+
+    // Adjust the number of entries in live_ranges so that it can be indexed by mapped indexes.
+    fn shift_live_ranges(live_ranges: &mut Vec<usize>, start_index: usize, shift_offset: isize) {
+        if shift_offset >= 0 {
+            for index in 0..(shift_offset as usize) {
+                live_ranges.insert(start_index + index, start_index + index);
+            }
+        } else {
+            for _ in 0..-shift_offset {
+                live_ranges.remove(start_index);
+            }
+        }
+    }
+
     // Dump live registers for register spill debugging.
     fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<usize>, num_regs: usize, spill_index: usize) {
         // Convert live_ranges to live_regs: the number of live registers at each index
@@ -1247,11 +1308,18 @@ impl Assembler
             }
         }
 
+        // We may need to reorder LoadInto instructions with a C argument operand.
+        // This buffers the operands of such instructions to process them in batches.
+        let mut c_args: Vec<(Reg, Opnd)> = vec![];
+
+        // live_ranges is indexed by original `index` given by the iterator.
         let live_ranges: Vec<usize> = take(&mut self.live_ranges);
+        // shifted_live_ranges is indexed by mapped indexes in insn operands.
+        let mut shifted_live_ranges: Vec<usize> = live_ranges.clone();
         let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), take(&mut self.side_exits));
         let mut iterator = self.into_draining_iter();
 
-        while let Some((index, mut insn)) = iterator.next_unmapped() {
+        while let Some((index, mut insn)) = iterator.next_mapped() {
             // Check if this is the last instruction that uses an operand that
             // spans more than one instruction. In that case, return the
             // allocated register to the pool.
@@ -1262,12 +1330,11 @@ impl Assembler
                         // Since we have an InsnOut, we know it spans more that one
                         // instruction.
                         let start_index = *idx;
-                        assert!(start_index < index);
 
                         // We're going to check if this is the last instruction that
                         // uses this operand. If it is, we can return the allocated
                         // register to the pool.
-                        if live_ranges[start_index] == index {
+                        if shifted_live_ranges[start_index] == index {
                             if let Some(Opnd::Reg(reg)) = asm.insns[start_index].out_opnd() {
                                 dealloc_reg(&mut pool, &regs, reg);
                             } else {
@@ -1371,7 +1438,27 @@ impl Assembler
                 }
             }
 
-            asm.push_insn(insn);
+            // Push instruction(s). Batch and reorder C argument operations if needed.
+            if let Insn::LoadInto { dest: Opnd::CArg(reg), opnd } = insn {
+                // Buffer C arguments
+                c_args.push((reg, opnd));
+            } else {
+                // C arguments are buffered until CCall
+                if c_args.len() > 0 {
+                    // Resolve C argument dependencies
+                    let c_args_len = c_args.len() as isize;
+                    let moves = reorder_c_args(&c_args.drain(..).into_iter().collect());
+                    shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len);
+
+                    // Push batched C arguments
+                    for (reg, opnd) in moves {
+                        asm.load_into(Opnd::Reg(reg), opnd);
+                    }
+                }
+                // Other instructions are pushed as is
+                asm.push_insn(insn);
+            }
+            iterator.map_insn_index(&mut asm);
         }
 
         assert_eq!(pool, 0, "Expected all registers to be returned to the pool");
@@ -1442,7 +1529,7 @@ impl AssemblerDrainingIterator {
     /// end of the current list of instructions in order to maintain that
     /// alignment.
     pub fn map_insn_index(&mut self, asm: &mut Assembler) {
-        self.indices.push(asm.insns.len() - 1);
+        self.indices.push(asm.insns.len().saturating_sub(1));
     }
 
     /// Map an operand by using this iterator's list of mapped indices.
diff --git a/yjit/src/backend/x86_64/mod.rs b/yjit/src/backend/x86_64/mod.rs
index a2ee94cf66..b40d8b2382 100644
--- a/yjit/src/backend/x86_64/mod.rs
+++ b/yjit/src/backend/x86_64/mod.rs
@@ -87,9 +87,9 @@ impl From<&Opnd> for X86Opnd {
 impl Assembler
 {
     // A special scratch register for intermediate processing.
-    // Note: right now this is only used by LeaLabel because label_ref accepts
-    // a closure and we don't want it to have to capture anything.
-    const SCRATCH0: X86Opnd = X86Opnd::Reg(R11_REG);
+    // This register is caller-saved (so we don't have to save it before using it)
+    pub const SCRATCH_REG: Reg = R11_REG;
+    const SCRATCH0: X86Opnd = X86Opnd::Reg(Assembler::SCRATCH_REG);
 
     /// List of registers that can be used for stack temps.
     pub const TEMP_REGS: [Reg; 5] = [RSI_REG, RDI_REG, R8_REG, R9_REG, R10_REG];
@@ -347,7 +347,7 @@ impl Assembler
                 // Load each operand into the corresponding argument
                 // register.
                 for (idx, opnd) in opnds.into_iter().enumerate() {
-                    asm.load_into(C_ARG_OPNDS[idx], *opnd);
+                    asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), *opnd);
                 }
 
                 // Now we push the CCall without any arguments so that it
@@ -1055,4 +1055,118 @@ mod tests {
 
         assert_eq!(format!("{:x}", cb), "4983f540");
     }
+
+    #[test]
+    fn test_reorder_c_args_no_cycle() {
+        let (mut asm, mut cb) = setup_asm();
+
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[0], // mov rdi, rdi (optimized away)
+            C_ARG_OPNDS[1], // mov rsi, rsi (optimized away)
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "b800000000ffd0", {"
+            0x0: mov eax, 0
+            0x5: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_single_cycle() {
+        let (mut asm, mut cb) = setup_asm();
+
+        // rdi and rsi form a cycle
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[1], // mov rdi, rsi
+            C_ARG_OPNDS[0], // mov rsi, rdi
+            C_ARG_OPNDS[2], // mov rdx, rdx (optimized away)
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "4989f34889fe4c89dfb800000000ffd0", {"
+            0x0: mov r11, rsi
+            0x3: mov rsi, rdi
+            0x6: mov rdi, r11
+            0x9: mov eax, 0
+            0xe: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_two_cycles() {
+        let (mut asm, mut cb) = setup_asm();
+
+        // rdi and rsi form a cycle, and rdx and rcx form another cycle
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[1], // mov rdi, rsi
+            C_ARG_OPNDS[0], // mov rsi, rdi
+            C_ARG_OPNDS[3], // mov rdx, rcx
+            C_ARG_OPNDS[2], // mov rcx, rdx
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "4989f34889fe4c89df4989cb4889d14c89dab800000000ffd0", {"
+            0x0: mov r11, rsi
+            0x3: mov rsi, rdi
+            0x6: mov rdi, r11
+            0x9: mov r11, rcx
+            0xc: mov rcx, rdx
+            0xf: mov rdx, r11
+            0x12: mov eax, 0
+            0x17: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_large_cycle() {
+        let (mut asm, mut cb) = setup_asm();
+
+        // rdi, rsi, and rdx form a cycle
+        asm.ccall(0 as _, vec![
+            C_ARG_OPNDS[1], // mov rdi, rsi
+            C_ARG_OPNDS[2], // mov rsi, rdx
+            C_ARG_OPNDS[0], // mov rdx, rdi
+        ]);
+        asm.compile_with_num_regs(&mut cb, 0);
+
+        assert_disasm!(cb, "4989f34889d64889fa4c89dfb800000000ffd0", {"
+            0x0: mov r11, rsi
+            0x3: mov rsi, rdx
+            0x6: mov rdx, rdi
+            0x9: mov rdi, r11
+            0xc: mov eax, 0
+            0x11: call rax
+        "});
+    }
+
+    #[test]
+    fn test_reorder_c_args_with_insn_out() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let rax = asm.load(Opnd::UImm(1));
+        let rcx = asm.load(Opnd::UImm(2));
+        let rdx = asm.load(Opnd::UImm(3));
+        // rcx and rdx form a cycle
+        asm.ccall(0 as _, vec![
+            rax, // mov rdi, rax
+            rcx, // mov rsi, rcx
+            rcx, // mov rdx, rcx
+            rdx, // mov rcx, rdx
+        ]);
+        asm.compile_with_num_regs(&mut cb, 3);
+
+        assert_disasm!(cb, "b801000000b902000000ba030000004889c74889ce4989cb4889d14c89dab800000000ffd0", {"
+            0x0: mov eax, 1
+            0x5: mov ecx, 2
+            0xa: mov edx, 3
+            0xf: mov rdi, rax
+            0x12: mov rsi, rcx
+            0x15: mov r11, rcx
+            0x18: mov rcx, rdx
+            0x1b: mov rdx, r11
+            0x1e: mov eax, 0
+            0x23: call rax
+        "});
+    }
 }
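
The cycle-breaking done by reorder_c_args above is a standard parallel-move resolution: emit any move whose destination is no longer read by a pending move, and when only cycles remain, route one value through the scratch register. Because each C argument register is written at most once, a stuck state is always a union of disjoint cycles, so a single scratch register (R11 on x86_64, X16 on arm64) is enough to make progress. The sketch below is a minimal, standalone model of the same idea outside YJIT's IR; the Move struct, the serialize_moves helper, and the plain register-name strings are illustrative assumptions, not YJIT code.

// Standalone model of the parallel-move resolution above (illustrative, not YJIT code).
// Each Move copies `src` into `dst`; all moves are meant to happen "at once".
#[derive(Clone, Debug)]
struct Move {
    dst: &'static str,
    src: &'static str,
}

// Serialize parallel moves so that no destination is overwritten before it is read,
// breaking cycles by routing one value through a dedicated scratch register.
fn serialize_moves(mut pending: Vec<Move>, scratch: &'static str) -> Vec<Move> {
    // Drop no-op moves (dst == src), like the filter at the top of reorder_c_args.
    pending.retain(|m| m.dst != m.src);

    let mut out = Vec::new();
    while !pending.is_empty() {
        // A move is safe when no pending move still reads its destination.
        if let Some(i) = (0..pending.len())
            .find(|&i| pending.iter().all(|m| m.src != pending[i].dst))
        {
            out.push(pending.remove(i));
        } else {
            // Only cycles remain: park one source in the scratch register now and
            // finish the interrupted move from the scratch register later.
            assert!(pending.iter().all(|m| m.src != scratch));
            let Move { dst, src } = pending.remove(0);
            out.push(Move { dst: scratch, src });
            pending.push(Move { dst, src: scratch });
        }
    }
    out
}

fn main() {
    // The rdi/rsi swap from test_reorder_c_args_single_cycle (rdx -> rdx is a no-op).
    let args = vec![
        Move { dst: "rdi", src: "rsi" },
        Move { dst: "rsi", src: "rdi" },
        Move { dst: "rdx", src: "rdx" },
    ];
    for m in serialize_moves(args, "r11") {
        println!("mov {}, {}", m.dst, m.src);
    }
    // => mov r11, rsi / mov rsi, rdi / mov rdi, r11
}

Running this on the rdi/rsi swap produces the same sequence the single-cycle test expects: mov r11, rsi; mov rsi, rdi; mov rdi, r11.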