YJIT: Interleave inline and outlined code blocks (#6460)
Co-authored-by: Alan Wu <alansi.xingwu@shopify.com>
Co-authored-by: Maxime Chevalier-Boisvert <maxime.chevalierboisvert@shopify.com>

Parent: e7c71c6c92
Commit: 64c52c4282

Notes (git, 2022-10-17 17:46:19 +00:00):
Merged-By: k0kubun <takashikkbn@gmail.com>
@@ -1,9 +1,20 @@
use std::cell::RefCell;
use std::cmp;
use std::fmt;
use std::mem;
use std::rc::Rc;
#[cfg(target_arch = "x86_64")]
use crate::backend::x86_64::JMP_PTR_BYTES;
#[cfg(target_arch = "aarch64")]
use crate::backend::arm64::JMP_PTR_BYTES;
use crate::backend::ir::Assembler;
use crate::backend::ir::Target;
use crate::virtualmem::WriteError;

#[cfg(feature = "asm_comments")]
use std::collections::BTreeMap;

use crate::codegen::CodegenGlobals;
use crate::virtualmem::{VirtualMem, CodePtr};

// Lots of manual vertical alignment in there that rustfmt doesn't handle well.
@@ -17,7 +28,8 @@ pub mod arm64;
//

/// Reference to an ASM label
struct LabelRef {
#[derive(Clone)]
pub struct LabelRef {
// Position in the code block where the label reference exists
pos: usize,

@@ -36,7 +48,7 @@ struct LabelRef {
/// Block of memory into which instructions can be assembled
pub struct CodeBlock {
// Memory for storing the encoded instructions
mem_block: VirtualMem,
mem_block: Rc<RefCell<VirtualMem>>,

// Memory block size
mem_size: usize,
@@ -44,6 +56,12 @@ pub struct CodeBlock {
// Current writing position
write_pos: usize,

// Size of a code page (inlined + outlined)
page_size: usize,

// Size reserved for writing a jump to the next page
page_end_reserve: usize,

// Table of registered label addresses
label_addrs: Vec<usize>,

@@ -58,7 +76,6 @@ pub struct CodeBlock {
asm_comments: BTreeMap<usize, Vec<String>>,

// True for OutlinedCb
#[cfg(feature = "disasm")]
pub outlined: bool,

// Set if the CodeBlock is unable to output some instructions,
@@ -67,27 +84,158 @@ pub struct CodeBlock {
dropped_bytes: bool,
}

/// Set of CodeBlock label states. Used for recovering the previous state.
pub struct LabelState {
label_addrs: Vec<usize>,
label_names: Vec<String>,
label_refs: Vec<LabelRef>,
}

impl CodeBlock {
/// Make a new CodeBlock
pub fn new(mem_block: VirtualMem, outlined: bool) -> Self {
Self {
mem_size: mem_block.virtual_region_size(),
pub fn new(mem_block: Rc<RefCell<VirtualMem>>, page_size: usize, outlined: bool) -> Self {
let mem_size = mem_block.borrow().virtual_region_size();
let mut cb = Self {
mem_block,
mem_size,
write_pos: 0,
page_size,
page_end_reserve: JMP_PTR_BYTES,
label_addrs: Vec::new(),
label_names: Vec::new(),
label_refs: Vec::new(),
#[cfg(feature = "asm_comments")]
asm_comments: BTreeMap::new(),
#[cfg(feature = "disasm")]
outlined,
dropped_bytes: false,
};
cb.write_pos = cb.page_start();
cb
}

/// Move the CodeBlock to the next page. If it's on the furthest page,
/// move the other CodeBlock to the next page as well.
pub fn next_page<F: Fn(&mut CodeBlock, CodePtr)>(&mut self, base_ptr: CodePtr, jmp_ptr: F) -> bool {
let old_write_ptr = self.get_write_ptr();
self.set_write_ptr(base_ptr);
self.without_page_end_reserve(|cb| assert!(cb.has_capacity(JMP_PTR_BYTES)));

// Move self to the next page
let next_page_idx = self.write_pos / self.page_size + 1;
if !self.set_page(next_page_idx, &jmp_ptr) {
self.set_write_ptr(old_write_ptr); // rollback if there are no more pages
return false;
}

// Move the other CodeBlock to the same page if it's on the furthest page
self.other_cb().unwrap().set_page(next_page_idx, &jmp_ptr);

return !self.dropped_bytes;
}
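
For orientation only (not part of the commit): with this change the inline and outlined CodeBlocks share one memory region split into pages, and a block that fills its half of a page hops to the same half of the next page. A minimal standalone sketch of the next_page() index math, assuming the 16 KiB default page size; the helper name next_page_idx is invented for the demo:

// Sketch of the next_page() bookkeeping: both code blocks share one region
// carved into pages, and a block that fills its half of a page hops to the
// same half of the following page. Page size is an assumed 16 KiB.
fn next_page_idx(write_pos: usize, page_size: usize) -> usize {
    write_pos / page_size + 1
}

fn main() {
    let page_size = 16 * 1024;
    // Inline code at byte 8_000 of page 0 would continue on page 1.
    assert_eq!(next_page_idx(8_000, page_size), 1);
    // Outlined code near the end of page 3 also moves to page 4.
    assert_eq!(next_page_idx(3 * page_size + 16_000, page_size), 4);
    // set_page() below refuses to move if that would go backwards or past mem_size.
}
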
/// Move the CodeBlock to page_idx only if it's not going backwards.
fn set_page<F: Fn(&mut CodeBlock, CodePtr)>(&mut self, page_idx: usize, jmp_ptr: &F) -> bool {
// Do not move the CodeBlock if page_idx points to an old position so that this
// CodeBlock will not overwrite existing code.
//
// Let's say this is the current situation:
//   cb: [page1, page2, page3 (write_pos)], ocb: [page1, page2, page3 (write_pos)]
//
// When cb needs to patch page1, this will be temporarily changed to:
//   cb: [page1 (write_pos), page2, page3], ocb: [page1, page2, page3 (write_pos)]
//
// While patching page1, cb may need to jump to page2. What set_page currently does is:
//   cb: [page1, page2 (write_pos), page3], ocb: [page1, page2, page3 (write_pos)]
// instead of:
//   cb: [page1, page2 (write_pos), page3], ocb: [page1, page2 (write_pos), page3]
// because moving ocb's write_pos from page3 to the beginning of page2 will let ocb's
// write_pos point to existing code in page2, which might let ocb overwrite it later.
//
// We could remember the last write_pos in page2 and let set_page use that position,
// but that would waste some space to keep a write_pos for every single page.
// It doesn't seem necessary for performance either. So we're currently not doing it.
let mut dst_pos = self.page_size * page_idx + self.page_start();
if self.page_size * page_idx < self.mem_size && self.write_pos < dst_pos {
// Reset dropped_bytes
self.dropped_bytes = false;

// Convert dst_pos to dst_ptr
let src_pos = self.write_pos;
self.write_pos = dst_pos;
let dst_ptr = self.get_write_ptr();
self.write_pos = src_pos;

// Generate jmp_ptr from src_pos to dst_pos
self.without_page_end_reserve(|cb| {
cb.add_comment("jump to next page");
jmp_ptr(cb, dst_ptr);
assert!(!cb.has_dropped_bytes());
});

// Start the next code from dst_pos
self.write_pos = dst_pos;
}
!self.dropped_bytes
}

/// write_pos of the current page start
pub fn page_start_pos(&self) -> usize {
self.get_write_pos() / self.page_size * self.page_size + self.page_start()
}

/// Offset of each page where CodeBlock should start writing
pub fn page_start(&self) -> usize {
let mut start = if self.inline() {
0
} else {
self.page_size / 2
};
if cfg!(debug_assertions) && !cfg!(test) {
// Leave illegal instructions at the beginning of each page to assert
// we're not accidentally crossing page boundaries.
start += JMP_PTR_BYTES;
}
start
}

/// Offset of each page where CodeBlock should stop writing (exclusive)
pub fn page_end(&self) -> usize {
let page_end = if self.inline() {
self.page_size / 2
} else {
self.page_size
};
page_end - self.page_end_reserve // reserve space to jump to the next page
}
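
A quick standalone check of the page_start()/page_end() arithmetic above, assuming 16 KiB pages, a 20-byte jump reserve, and a release build (no debug padding); the free functions here are illustrative stand-ins for the methods:

// Throwaway check of the page_start()/page_end() arithmetic, assuming
// page_size = 16 KiB, page_end_reserve = JMP_PTR_BYTES = 20, release build.
fn page_start(inline: bool, page_size: usize) -> usize {
    if inline { 0 } else { page_size / 2 }
}

fn page_end(inline: bool, page_size: usize, reserve: usize) -> usize {
    (if inline { page_size / 2 } else { page_size }) - reserve
}

fn main() {
    let (page_size, reserve) = (16 * 1024, 20);
    assert_eq!(page_start(true, page_size), 0);              // inline half starts at the page base
    assert_eq!(page_end(true, page_size, reserve), 8172);    // 8192 - 20
    assert_eq!(page_start(false, page_size), 8192);          // outlined half starts mid-page
    assert_eq!(page_end(false, page_size, reserve), 16364);  // 16384 - 20
}
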
/// Call a given function with page_end_reserve = 0
pub fn without_page_end_reserve<F: Fn(&mut Self)>(&mut self, block: F) {
let old_page_end_reserve = self.page_end_reserve;
self.page_end_reserve = 0;
block(self);
self.page_end_reserve = old_page_end_reserve;
}

/// Return the address ranges of a given address range that this CodeBlock can write.
pub fn writable_addrs(&self, start_ptr: CodePtr, end_ptr: CodePtr) -> Vec<(usize, usize)> {
let mut addrs = vec![];
let mut start = start_ptr.raw_ptr() as usize;
let codeblock_end = self.get_ptr(self.get_mem_size()).raw_ptr() as usize;
let end = std::cmp::min(end_ptr.raw_ptr() as usize, codeblock_end);
while start < end {
let current_page = start / self.page_size * self.page_size;
let page_end = std::cmp::min(end, current_page + self.page_end()) as usize;
addrs.push((start, page_end));
start = current_page + self.page_size + self.page_start();
}
addrs
}

/// Check if this code block has sufficient remaining capacity
pub fn has_capacity(&self, num_bytes: usize) -> bool {
self.write_pos + num_bytes < self.mem_size
let page_offset = self.write_pos % self.page_size;
let capacity = self.page_end().saturating_sub(page_offset);
num_bytes <= capacity
}

/// Add an assembly comment if the feature is on.
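
Illustrative note: has_capacity() now measures the room left in the current page rather than in the whole memory block. A minimal sketch with assumed values (16 KiB page, inline half ending at 8172):

// Sketch of the new per-page has_capacity() logic; page_end assumes the
// inline half of a 16 KiB page minus a 20-byte reserve.
fn has_capacity(write_pos: usize, num_bytes: usize, page_size: usize, page_end: usize) -> bool {
    let page_offset = write_pos % page_size;
    let capacity = page_end.saturating_sub(page_offset);
    num_bytes <= capacity
}

fn main() {
    let (page_size, page_end) = (16 * 1024, 8 * 1024 - 20);
    assert!(has_capacity(100, 1024, page_size, page_end));          // plenty of room early in the page
    assert!(!has_capacity(8 * 1024 - 30, 64, page_size, page_end)); // too close to the reserved tail
}
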
@@ -121,8 +269,8 @@ impl CodeBlock {
self.write_pos
}

pub fn get_mem(&mut self) -> &mut VirtualMem {
&mut self.mem_block
pub fn write_mem(&self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> {
self.mem_block.borrow_mut().write_byte(write_ptr, byte)
}

// Set the current write position
@@ -134,49 +282,31 @@ impl CodeBlock {
self.write_pos = pos;
}

// Align the current write pointer to a multiple of bytes
pub fn align_pos(&mut self, multiple: u32) {
// Compute the alignment boundary that is lower or equal
// Do everything with usize
let multiple: usize = multiple.try_into().unwrap();
let pos = self.get_write_ptr().raw_ptr() as usize;
let remainder = pos % multiple;
let prev_aligned = pos - remainder;

if prev_aligned == pos {
// Already aligned so do nothing
} else {
// Align by advancing
let pad = multiple - remainder;
self.set_pos(self.get_write_pos() + pad);
}
}

// Set the current write position from a pointer
pub fn set_write_ptr(&mut self, code_ptr: CodePtr) {
let pos = code_ptr.into_usize() - self.mem_block.start_ptr().into_usize();
let pos = code_ptr.into_usize() - self.mem_block.borrow().start_ptr().into_usize();
self.set_pos(pos);
}

/// Get a (possibly dangling) direct pointer into the executable memory block
pub fn get_ptr(&self, offset: usize) -> CodePtr {
self.mem_block.start_ptr().add_bytes(offset)
self.mem_block.borrow().start_ptr().add_bytes(offset)
}

/// Get a (possibly dangling) direct pointer to the current write position
pub fn get_write_ptr(&mut self) -> CodePtr {
pub fn get_write_ptr(&self) -> CodePtr {
self.get_ptr(self.write_pos)
}

/// Write a single byte at the current position.
pub fn write_byte(&mut self, byte: u8) {
let write_ptr = self.get_write_ptr();

if self.mem_block.write_byte(write_ptr, byte).is_ok() {
self.write_pos += 1;
} else {
if !self.has_capacity(1) || self.mem_block.borrow_mut().write_byte(write_ptr, byte).is_err() {
self.dropped_bytes = true;
}

// Always advance write_pos since arm64 PadEntryExit needs this to stop the loop.
self.write_pos += 1;
}

/// Write multiple bytes starting from the current position.
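
Illustrative sketch of the write_byte() failure handling above: when a write does not fit, the byte is dropped and dropped_bytes is set, but write_pos still advances so emission loops terminate and the caller can retry on the next page. TinyWriter and its 32-byte page are invented for the demo:

// Demo of "drop the byte but still advance the position" so a caller can
// detect the overflow and retry the whole instruction on the next page.
struct TinyWriter { buf: Vec<u8>, write_pos: usize, dropped_bytes: bool, page_end: usize }

impl TinyWriter {
    fn write_byte(&mut self, byte: u8) {
        if self.write_pos < self.page_end {
            self.buf[self.write_pos] = byte;
        } else {
            self.dropped_bytes = true; // signal the caller to move to the next page
        }
        self.write_pos += 1; // always advance, even on failure
    }
}

fn main() {
    let mut w = TinyWriter { buf: vec![0; 64], write_pos: 30, dropped_bytes: false, page_end: 32 };
    for b in 0..4u8 { w.write_byte(b); }
    assert!(w.dropped_bytes);    // the last two bytes did not fit in the 32-byte page
    assert_eq!(w.write_pos, 34); // but the position still moved past them
}
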
@@ -242,6 +372,9 @@ impl CodeBlock {
self.label_refs.push(LabelRef { pos: self.write_pos, label_idx, num_bytes, encode });

// Move past however many bytes the instruction takes up
if !self.has_capacity(num_bytes) {
self.dropped_bytes = true; // retry emitting the Insn after next_page
}
self.write_pos += num_bytes;
}

@@ -274,14 +407,43 @@ impl CodeBlock {
assert!(self.label_refs.is_empty());
}

pub fn mark_all_executable(&mut self) {
self.mem_block.mark_all_executable();
pub fn clear_labels(&mut self) {
self.label_addrs.clear();
self.label_names.clear();
self.label_refs.clear();
}

pub fn get_label_state(&self) -> LabelState {
LabelState {
label_addrs: self.label_addrs.clone(),
label_names: self.label_names.clone(),
label_refs: self.label_refs.clone(),
}
}

pub fn set_label_state(&mut self, state: LabelState) {
self.label_addrs = state.label_addrs;
self.label_names = state.label_names;
self.label_refs = state.label_refs;
}

pub fn mark_all_executable(&mut self) {
self.mem_block.borrow_mut().mark_all_executable();
}

#[cfg(feature = "disasm")]
pub fn inline(&self) -> bool {
!self.outlined
}

pub fn other_cb(&self) -> Option<&'static mut Self> {
if !CodegenGlobals::has_instance() {
None
} else if self.inline() {
Some(CodegenGlobals::get_outlined_cb().unwrap())
} else {
Some(CodegenGlobals::get_inline_cb())
}
}
}

#[cfg(test)]
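
Illustrative sketch of how the new LabelState snapshot is meant to be used: clone the label vectors before emitting an instruction, and restore them if that instruction has to be re-emitted on a fresh page. Types are simplified for the demo:

// Snapshot/restore of label bookkeeping around a retried instruction;
// label references are reduced to plain positions for the demo.
#[derive(Clone)]
struct LabelState { label_addrs: Vec<usize>, label_names: Vec<String>, label_refs: Vec<usize> }

fn main() {
    let mut state = LabelState { label_addrs: vec![0x10], label_names: vec!["entry".into()], label_refs: vec![] };
    let saved = state.clone();   // get_label_state()
    state.label_refs.push(0x40); // emitting an insn records a label reference...
    state = saved;               // ...set_label_state() discards it before the retry
    assert!(state.label_refs.is_empty());
}
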
@@ -295,7 +457,7 @@ impl CodeBlock {
let mem_start: *const u8 = alloc.mem_start();
let virt_mem = VirtualMem::new(alloc, 1, mem_start as *mut u8, mem_size);

Self::new(virt_mem, false)
Self::new(Rc::new(RefCell::new(virt_mem)), 16 * 1024, false)
}
}

@@ -303,7 +465,7 @@
impl fmt::LowerHex for CodeBlock {
fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
for pos in 0..self.write_pos {
let byte = unsafe { self.mem_block.start_ptr().raw_ptr().add(pos).read() };
let byte = unsafe { self.mem_block.borrow().start_ptr().raw_ptr().add(pos).read() };
fmtr.write_fmt(format_args!("{:02x}", byte))?;
}
Ok(())
@@ -4,7 +4,7 @@

use crate::asm::{CodeBlock};
use crate::asm::arm64::*;
use crate::codegen::{JITState};
use crate::codegen::{JITState, CodegenGlobals};
use crate::cruby::*;
use crate::backend::ir::*;
use crate::virtualmem::CodePtr;
@@ -36,6 +36,9 @@ pub const _C_RET_OPND: Opnd = Opnd::Reg(X0_REG);
pub const C_SP_REG: A64Opnd = X31;
pub const C_SP_STEP: i32 = 16;

// The number of bytes that are generated by emit_jmp_ptr
pub const JMP_PTR_BYTES: usize = 20;

/// Map Opnd to A64Opnd
impl From<Opnd> for A64Opnd {
fn from(opnd: Opnd) -> Self {
@@ -567,7 +570,7 @@ impl Assembler
/// Emit the required instructions to load the given value into the
/// given register. Our goal here is to use as few instructions as
/// possible to get this value into the register.
fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> i32 {
fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize {
let mut current = value;

if current <= 0xffff {
@@ -680,6 +683,31 @@ impl Assembler
ldr_post(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, C_SP_STEP));
}

fn emit_jmp_ptr(cb: &mut CodeBlock, dst_ptr: CodePtr) {
let src_addr = cb.get_write_ptr().into_i64();
let dst_addr = dst_ptr.into_i64();

// If the offset is short enough, then we'll use the
// branch instruction. Otherwise, we'll move the
// destination into a register and use the branch
// register instruction.
let num_insns = if b_offset_fits_bits((dst_addr - src_addr) / 4) {
b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32));
1
} else {
let num_insns = emit_load_value(cb, Assembler::SCRATCH0, dst_addr as u64);
br(cb, Assembler::SCRATCH0);
num_insns + 1
};

// Make sure it's always a consistent number of
// instructions in case it gets patched and has to
// use the other branch.
for _ in num_insns..(JMP_PTR_BYTES / 4) {
nop(cb);
}
}
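
Side note on the padding rule above, shown as a runnable sketch: JMP_PTR_BYTES is 20 bytes on arm64, i.e. five 4-byte instructions, so a short branch is followed by nops until the jump can later be patched in place with the long load-and-branch-register form. nops_needed is an invented helper:

// How many nops emit_jmp_ptr pads after the emitted instructions,
// assuming the 20-byte JMP_PTR_BYTES constant from the arm64 backend.
fn nops_needed(emitted_insns: usize) -> usize {
    const JMP_PTR_BYTES: usize = 20;
    JMP_PTR_BYTES / 4 - emitted_insns
}

fn main() {
    assert_eq!(nops_needed(1), 4); // short relative branch: b + 4 nops
    assert_eq!(nops_needed(5), 0); // mov sequence + br: already 5 instructions
}
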
// dbg!(&self.insns);

// List of GC offsets
@@ -687,7 +715,13 @@ impl Assembler

// For each instruction
let start_write_pos = cb.get_write_pos();
for insn in &self.insns {
let mut insn_idx: usize = 0;
while let Some(insn) = self.insns.get(insn_idx) {
let src_ptr = cb.get_write_ptr();
let had_dropped_bytes = cb.has_dropped_bytes();
let old_label_state = cb.get_label_state();
let mut insn_gc_offsets: Vec<u32> = Vec::new();

match insn {
Insn::Comment(text) => {
if cfg!(feature = "asm_comments") {
@@ -796,7 +830,7 @@ impl Assembler
cb.write_bytes(&value.as_u64().to_le_bytes());

let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
gc_offsets.push(ptr_offset);
insn_gc_offsets.push(ptr_offset);
},
Opnd::None => {
unreachable!("Attempted to load from None operand");
@@ -904,28 +938,7 @@ impl Assembler
Insn::Jmp(target) => {
match target {
Target::CodePtr(dst_ptr) => {
let src_addr = cb.get_write_ptr().into_i64();
let dst_addr = dst_ptr.into_i64();

// If the offset is short enough, then we'll use the
// branch instruction. Otherwise, we'll move the
// destination into a register and use the branch
// register instruction.
let num_insns = if b_offset_fits_bits((dst_addr - src_addr) / 4) {
b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32));
0
} else {
let num_insns = emit_load_value(cb, Self::SCRATCH0, dst_addr as u64);
br(cb, Self::SCRATCH0);
num_insns
};

// Make sure it's always a consistent number of
// instructions in case it gets patched and has to
// use the other branch.
for _ in num_insns..4 {
nop(cb);
}
emit_jmp_ptr(cb, *dst_ptr);
},
Target::Label(label_idx) => {
// Here we're going to save enough space for
@@ -997,13 +1010,21 @@ impl Assembler
csel(cb, out.into(), truthy.into(), falsy.into(), Condition::GE);
}
Insn::LiveReg { .. } => (), // just a reg alloc signal, no code
Insn::PadEntryExit => {
let jmp_len = 5 * 4; // Op::Jmp may emit 5 instructions
while (cb.get_write_pos() - start_write_pos) < jmp_len {
Insn::PadInvalPatch => {
while (cb.get_write_pos().saturating_sub(std::cmp::max(start_write_pos, cb.page_start_pos()))) < JMP_PTR_BYTES {
nop(cb);
}
}
};

// On failure, jump to the next page and retry the current insn
if !had_dropped_bytes && cb.has_dropped_bytes() && cb.next_page(src_ptr, emit_jmp_ptr) {
// Reset cb states before retrying the current Insn
cb.set_label_state(old_label_state);
} else {
insn_idx += 1;
gc_offsets.append(&mut insn_gc_offsets);
}
}

gc_offsets
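
Illustrative sketch of the retry shape used in the emit loop above: an index-based loop lets the same instruction be emitted again after switching to the next page instead of silently dropping it. The one-shot page_full flag stands in for cb.has_dropped_bytes():

// Index-based emit loop that retries the current instruction once after a
// simulated "page full" condition, mirroring the while-let structure above.
fn main() {
    let insns = ["lea", "jmp"];
    let mut insn_idx = 0;
    let mut page_full = true; // pretend the first attempt at "lea" overflows the page
    let mut emitted = Vec::new();
    while let Some(insn) = insns.get(insn_idx) {
        let failed = page_full;
        page_full = false; // next_page(): switch pages, then retry the same insn_idx
        if failed {
            continue; // roll back label state and retry the current instruction
        }
        emitted.push(*insn);
        insn_idx += 1;
    }
    assert_eq!(emitted, ["lea", "jmp"]); // "lea" was emitted on the second attempt
}
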
@@ -1020,21 +1041,23 @@ impl Assembler
assert!(label_idx == idx);
}

let start_write_pos = cb.get_write_pos();
let start_ptr = cb.get_write_ptr();
let gc_offsets = asm.arm64_emit(cb);

if !cb.has_dropped_bytes() {
if cb.has_dropped_bytes() {
cb.clear_labels();
} else {
cb.link_labels();
}

// Invalidate icache for newly written out region so we don't run stale code.
#[cfg(not(test))]
{
let start = cb.get_ptr(start_write_pos).raw_ptr();
let write_ptr = cb.get_write_ptr().raw_ptr();
let codeblock_end = cb.get_ptr(cb.get_mem_size()).raw_ptr();
let end = std::cmp::min(write_ptr, codeblock_end);
unsafe { rb_yjit_icache_invalidate(start as _, end as _) };
// Invalidate icache for newly written out region so we don't run stale code.
// It should invalidate only the code ranges of the current cb because the code
// ranges of the other cb might have a memory region that is still PROT_NONE.
#[cfg(not(test))]
cb.without_page_end_reserve(|cb| {
for (start, end) in cb.writable_addrs(start_ptr, cb.get_write_ptr()) {
unsafe { rb_yjit_icache_invalidate(start as _, end as _) };
}
});
}

gc_offsets
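
Illustrative sketch of the per-page range walk behind writable_addrs(), which is why the icache flush above only touches the current block's halves of each page (the other block's halves may still be PROT_NONE). Page size and bounds are assumed values for the demo:

// Walk a write span page by page and keep only the current block's half of
// each page, assuming 16 KiB pages and an inline half ending at 8172.
fn writable_addrs(mut start: usize, end: usize, page_size: usize, page_start: usize, page_end: usize) -> Vec<(usize, usize)> {
    let mut addrs = vec![];
    while start < end {
        let current_page = start / page_size * page_size;
        addrs.push((start, end.min(current_page + page_end)));
        start = current_page + page_size + page_start;
    }
    addrs
}

fn main() {
    let (page_size, page_start, page_end) = (16 * 1024, 0, 8 * 1024 - 20);
    // A write span crossing from page 0 into page 1 yields two inline ranges.
    let ranges = writable_addrs(100, 16 * 1024 + 500, page_size, page_start, page_end);
    assert_eq!(ranges, vec![(100, 8172), (16384, 16884)]);
}
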
@@ -5,6 +5,7 @@
use std::cell::Cell;
use std::fmt;
use std::convert::From;
use std::io::Write;
use std::mem::take;
use crate::cruby::{VALUE};
use crate::virtualmem::{CodePtr};
@@ -433,9 +434,9 @@ pub enum Insn {
// binary OR operation.
Or { left: Opnd, right: Opnd, out: Opnd },

/// Pad nop instructions to accommodate Op::Jmp in case the block is
/// invalidated.
PadEntryExit,
/// Pad nop instructions to accommodate Op::Jmp in case the block or the insn
/// is invalidated.
PadInvalPatch,

// Mark a position in the generated code
PosMarker(PosMarkerFn),
@@ -521,7 +522,7 @@ impl Insn {
Insn::Mov { .. } => "Mov",
Insn::Not { .. } => "Not",
Insn::Or { .. } => "Or",
Insn::PadEntryExit => "PadEntryExit",
Insn::PadInvalPatch => "PadInvalPatch",
Insn::PosMarker(_) => "PosMarker",
Insn::RShift { .. } => "RShift",
Insn::Store { .. } => "Store",
@@ -658,7 +659,7 @@ impl<'a> Iterator for InsnOpndIterator<'a> {
Insn::Jz(_) |
Insn::Label(_) |
Insn::LeaLabel { .. } |
Insn::PadEntryExit |
Insn::PadInvalPatch |
Insn::PosMarker(_) => None,
Insn::CPopInto(opnd) |
Insn::CPush(opnd) |
@@ -755,7 +756,7 @@ impl<'a> InsnOpndMutIterator<'a> {
Insn::Jz(_) |
Insn::Label(_) |
Insn::LeaLabel { .. } |
Insn::PadEntryExit |
Insn::PadInvalPatch |
Insn::PosMarker(_) => None,
Insn::CPopInto(opnd) |
Insn::CPush(opnd) |
@@ -1474,8 +1475,8 @@ impl Assembler {
out
}

pub fn pad_entry_exit(&mut self) {
self.push_insn(Insn::PadEntryExit);
pub fn pad_inval_patch(&mut self) {
self.push_insn(Insn::PadInvalPatch);
}

//pub fn pos_marker<F: FnMut(CodePtr)>(&mut self, marker_fn: F)
@@ -231,7 +231,7 @@ fn test_jcc_ptr()
{
let (mut asm, mut cb) = setup_asm();

let side_exit = Target::CodePtr((5 as *mut u8).into());
let side_exit = Target::CodePtr(((cb.get_write_ptr().raw_ptr() as usize + 4) as *mut u8).into());
let not_mask = asm.not(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_MASK));
asm.test(
Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG),
@@ -248,7 +248,7 @@ fn test_jmp_ptr()
{
let (mut asm, mut cb) = setup_asm();

let stub = Target::CodePtr((5 as *mut u8).into());
let stub = Target::CodePtr(((cb.get_write_ptr().raw_ptr() as usize + 4) as *mut u8).into());
asm.jmp(stub);

asm.compile_with_num_regs(&mut cb, 0);
@@ -259,7 +259,7 @@ fn test_jo()
{
let (mut asm, mut cb) = setup_asm();

let side_exit = Target::CodePtr((5 as *mut u8).into());
let side_exit = Target::CodePtr(((cb.get_write_ptr().raw_ptr() as usize + 4) as *mut u8).into());

let arg1 = Opnd::mem(64, SP, 0);
let arg0 = Opnd::mem(64, SP, 8);
@@ -9,6 +9,7 @@ use crate::asm::x86_64::*;
use crate::codegen::{JITState};
use crate::cruby::*;
use crate::backend::ir::*;
use crate::codegen::CodegenGlobals;

// Use the x86 register type for this platform
pub type Reg = X86Reg;
@@ -32,6 +33,9 @@ pub const _C_ARG_OPNDS: [Opnd; 6] = [
pub const C_RET_REG: Reg = RAX_REG;
pub const _C_RET_OPND: Opnd = Opnd::Reg(RAX_REG);

// The number of bytes that are generated by jmp_ptr
pub const JMP_PTR_BYTES: usize = 6;

/// Map Opnd to X86Opnd
impl From<Opnd> for X86Opnd {
fn from(opnd: Opnd) -> Self {
@@ -375,7 +379,13 @@ impl Assembler

// For each instruction
let start_write_pos = cb.get_write_pos();
for insn in &self.insns {
let mut insns_idx: usize = 0;
while let Some(insn) = self.insns.get(insns_idx) {
let src_ptr = cb.get_write_ptr();
let had_dropped_bytes = cb.has_dropped_bytes();
let old_label_state = cb.get_label_state();
let mut insn_gc_offsets: Vec<u32> = Vec::new();

match insn {
Insn::Comment(text) => {
if cfg!(feature = "asm_comments") {
@@ -461,7 +471,7 @@ impl Assembler
if !val.special_const_p() {
// The pointer immediate is encoded as the last part of the mov written out
let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
gc_offsets.push(ptr_offset);
insn_gc_offsets.push(ptr_offset);
}
}
},
@@ -651,11 +661,10 @@ impl Assembler
emit_csel(cb, *truthy, *falsy, *out, cmovl);
}
Insn::LiveReg { .. } => (), // just a reg alloc signal, no code
Insn::PadEntryExit => {
// We assume that our Op::Jmp usage that gets invalidated is <= 5
let code_size: u32 = (cb.get_write_pos() - start_write_pos).try_into().unwrap();
if code_size < 5 {
nop(cb, 5 - code_size);
Insn::PadInvalPatch => {
let code_size = cb.get_write_pos().saturating_sub(std::cmp::max(start_write_pos, cb.page_start_pos()));
if code_size < JMP_PTR_BYTES {
nop(cb, (JMP_PTR_BYTES - code_size) as u32);
}
}

@@ -666,6 +675,15 @@ impl Assembler
#[allow(unreachable_patterns)]
_ => panic!("unsupported instruction passed to x86 backend: {:?}", insn)
};

// On failure, jump to the next page and retry the current insn
if !had_dropped_bytes && cb.has_dropped_bytes() && cb.next_page(src_ptr, jmp_ptr) {
// Reset cb states before retrying the current Insn
cb.set_label_state(old_label_state);
} else {
insns_idx += 1;
gc_offsets.append(&mut insn_gc_offsets);
}
}

gc_offsets
@@ -684,7 +702,9 @@ impl Assembler

let gc_offsets = asm.x86_emit(cb);

if !cb.has_dropped_bytes() {
if cb.has_dropped_bytes() {
cb.clear_labels();
} else {
cb.link_labels();
}
@@ -13,13 +13,15 @@ use crate::utils::*;
use CodegenStatus::*;
use InsnOpnd::*;

use std::cell::RefCell;
use std::cell::RefMut;
use std::cmp;
use std::collections::HashMap;
use std::ffi::CStr;
use std::mem::{self, size_of};
use std::os::raw::c_uint;
use std::ptr;
use std::rc::Rc;
use std::slice;

pub use crate::virtualmem::CodePtr;
@@ -296,6 +298,7 @@ fn jit_prepare_routine_call(
/// Record the current codeblock write position for rewriting into a jump into
/// the outlined block later. Used to implement global code invalidation.
fn record_global_inval_patch(asm: &mut Assembler, outline_block_target_pos: CodePtr) {
asm.pad_inval_patch();
asm.pos_marker(move |code_ptr| {
CodegenGlobals::push_global_inval_patch(code_ptr, outline_block_target_pos);
});
@@ -606,19 +609,6 @@ fn gen_pc_guard(asm: &mut Assembler, iseq: IseqPtr, insn_idx: u32) {
/// Compile an interpreter entry block to be inserted into an iseq
/// Returns None if compilation fails.
pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> Option<CodePtr> {
const MAX_PROLOGUE_SIZE: usize = 1024;

// Check if we have enough executable memory
if !cb.has_capacity(MAX_PROLOGUE_SIZE) {
return None;
}

let old_write_pos = cb.get_write_pos();

// TODO: figure out if this is actually beneficial for performance
// Align the current write position to cache line boundaries
cb.align_pos(64);

let code_ptr = cb.get_write_ptr();

let mut asm = Assembler::new();
@@ -660,10 +650,11 @@ pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> O

asm.compile(cb);

// Verify MAX_PROLOGUE_SIZE
assert!(cb.get_write_pos() - old_write_pos <= MAX_PROLOGUE_SIZE);

return Some(code_ptr);
if cb.has_dropped_bytes() {
None
} else {
Some(code_ptr)
}
}

// Generate code to check for interrupts and take a side-exit.
@@ -853,7 +844,7 @@ pub fn gen_single_block(
{
let mut block = jit.block.borrow_mut();
if block.entry_exit.is_some() {
asm.pad_entry_exit();
asm.pad_inval_patch();
}

// Compile code into the code block
@@ -6544,29 +6535,13 @@ static mut CODEGEN_GLOBALS: Option<CodegenGlobals> = None;
impl CodegenGlobals {
/// Initialize the codegen globals
pub fn init() {
// Executable memory size in MiB
let mem_size = get_option!(exec_mem_size) * 1024 * 1024;
// Executable memory and code page size in bytes
let mem_size = get_option!(exec_mem_size);
let code_page_size = get_option!(code_page_size);

#[cfg(not(test))]
let (mut cb, mut ocb) = {
// TODO(alan): we can error more gracefully when the user gives
// --yjit-exec-mem=absurdly-large-number
//
// 2 GiB. It's likely a bug if we generate this much code.
const MAX_BUFFER_SIZE: usize = 2 * 1024 * 1024 * 1024;
assert!(mem_size <= MAX_BUFFER_SIZE);
let mem_size_u32 = mem_size as u32;
let half_size = mem_size / 2;

let page_size = unsafe { rb_yjit_get_page_size() };
let assert_page_aligned = |ptr| assert_eq!(
0,
ptr as usize % page_size.as_usize(),
"Start of virtual address block should be page-aligned",
);

let virt_block: *mut u8 = unsafe { rb_yjit_reserve_addr_space(mem_size_u32) };
let second_half = virt_block.wrapping_add(half_size);
let virt_block: *mut u8 = unsafe { rb_yjit_reserve_addr_space(mem_size as u32) };

// Memory protection syscalls need page-aligned addresses, so check it here. Assuming
// `virt_block` is page-aligned, `second_half` should be page-aligned as long as the
@@ -6575,26 +6550,25 @@ impl CodegenGlobals {
//
// Basically, we don't support x86-64 2MiB and 1GiB pages. ARMv8 can do up to 64KiB
// (2¹⁶ bytes) pages, which should be fine. 4KiB pages seem to be the most popular though.
assert_page_aligned(virt_block);
assert_page_aligned(second_half);
let page_size = unsafe { rb_yjit_get_page_size() };
assert_eq!(
virt_block as usize % page_size.as_usize(), 0,
"Start of virtual address block should be page-aligned",
);
assert_eq!(code_page_size % page_size.as_usize(), 0, "code_page_size was not page-aligned");

use crate::virtualmem::*;

let first_half = VirtualMem::new(
let mem_block = VirtualMem::new(
SystemAllocator {},
page_size,
virt_block,
half_size
);
let second_half = VirtualMem::new(
SystemAllocator {},
page_size,
second_half,
half_size
mem_size,
);
let mem_block = Rc::new(RefCell::new(mem_block));

let cb = CodeBlock::new(first_half, false);
let ocb = OutlinedCb::wrap(CodeBlock::new(second_half, true));
let cb = CodeBlock::new(mem_block.clone(), code_page_size, false);
let ocb = OutlinedCb::wrap(CodeBlock::new(mem_block, code_page_size, true));

(cb, ocb)
};
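
Toy illustration (not part of the commit) of the ownership change above: the inline and outlined blocks now share one VirtualMem through Rc<RefCell<...>> instead of owning two disjoint halves. A Vec<u8> stands in for VirtualMem:

// Two block handles writing into one shared buffer, the same sharing shape
// that Rc<RefCell<VirtualMem>> gives cb and ocb.
use std::cell::RefCell;
use std::rc::Rc;

struct Block { mem: Rc<RefCell<Vec<u8>>>, write_pos: usize }

impl Block {
    fn write_byte(&mut self, byte: u8) {
        self.mem.borrow_mut()[self.write_pos] = byte;
        self.write_pos += 1;
    }
}

fn main() {
    let mem = Rc::new(RefCell::new(vec![0u8; 32]));
    let mut cb = Block { mem: Rc::clone(&mem), write_pos: 0 };   // inline half of a tiny page
    let mut ocb = Block { mem: Rc::clone(&mem), write_pos: 16 }; // outlined half of the same page
    cb.write_byte(0xAA);
    ocb.write_byte(0xBB);
    assert_eq!((mem.borrow()[0], mem.borrow()[16]), (0xAA, 0xBB));
}
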
@@ -6702,6 +6676,10 @@
unsafe { CODEGEN_GLOBALS.as_mut().unwrap() }
}

pub fn has_instance() -> bool {
unsafe { CODEGEN_GLOBALS.as_mut().is_some() }
}

/// Get a mutable reference to the inline code block
pub fn get_inline_cb() -> &'static mut CodeBlock {
&mut CodegenGlobals::get_instance().inline_cb
@@ -665,7 +665,7 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) {
if new_addr != object {
for (byte_idx, &byte) in new_addr.as_u64().to_le_bytes().iter().enumerate() {
let byte_code_ptr = value_code_ptr.add_bytes(byte_idx);
cb.get_mem().write_byte(byte_code_ptr, byte)
cb.write_mem(byte_code_ptr, byte)
.expect("patching existing code should be within bounds");
}
}
@@ -1916,7 +1916,9 @@ pub fn gen_branch(

// Call the branch generation function
asm.mark_branch_start(&branchref);
gen_fn(asm, branch.dst_addrs[0].unwrap(), branch.dst_addrs[1], BranchShape::Default);
if let Some(dst_addr) = branch.dst_addrs[0] {
gen_fn(asm, dst_addr, branch.dst_addrs[1], BranchShape::Default);
}
asm.mark_branch_end(&branchref);
}

@@ -1955,6 +1957,7 @@ pub fn gen_direct_jump(jit: &JITState, ctx: &Context, target0: BlockId, asm: &mu
branch.shape = BranchShape::Default;

// Call the branch generation function
asm.comment("gen_direct_jmp: existing block");
asm.mark_branch_start(&branchref);
gen_jump_branch(asm, branch.dst_addrs[0].unwrap(), None, BranchShape::Default);
asm.mark_branch_end(&branchref);
@@ -1965,6 +1968,7 @@ pub fn gen_direct_jump(jit: &JITState, ctx: &Context, target0: BlockId, asm: &mu
branch.shape = BranchShape::Next0;

// The branch is effectively empty (a noop)
asm.comment("gen_direct_jmp: fallthrough");
asm.mark_branch_start(&branchref);
asm.mark_branch_end(&branchref);
}
@@ -2003,7 +2007,9 @@ pub fn defer_compilation(

// Call the branch generation function
asm.mark_branch_start(&branch_rc);
gen_jump_branch(asm, branch.dst_addrs[0].unwrap(), None, BranchShape::Default);
if let Some(dst_addr) = branch.dst_addrs[0] {
gen_jump_branch(asm, dst_addr, None, BranchShape::Default);
}
asm.mark_branch_end(&branch_rc);
}
@@ -4,9 +4,14 @@ use std::ffi::CStr;
#[derive(Clone, PartialEq, Eq, Debug)]
#[repr(C)]
pub struct Options {
// Size of the executable memory block to allocate in MiB
// Size of the executable memory block to allocate in bytes
// Note that the command line argument is expressed in MiB and not bytes
pub exec_mem_size: usize,

// Size of each executable memory code page in bytes
// Note that the command line argument is expressed in KiB and not bytes
pub code_page_size: usize,

// Number of method calls after which to start generating code
// Threshold==1 means compile on first execution
pub call_threshold: usize,
@@ -48,7 +53,8 @@ pub struct Options {

// Initialize the options to default values
pub static mut OPTIONS: Options = Options {
exec_mem_size: 256,
exec_mem_size: 256 * 1024 * 1024,
code_page_size: 16 * 1024,
call_threshold: 10,
greedy_versioning: false,
no_type_prop: false,
@@ -118,8 +124,30 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> {
match (opt_name, opt_val) {
("", "") => (), // Simply --yjit

("exec-mem-size", _) => match opt_val.parse() {
Ok(n) => unsafe { OPTIONS.exec_mem_size = n },
("exec-mem-size", _) => match opt_val.parse::<usize>() {
Ok(n) => {
if n == 0 || n > 2 * 1024 * 1024 {
return None
}

// Convert from MiB to bytes internally for convenience
unsafe { OPTIONS.exec_mem_size = n * 1024 * 1024 }
}
Err(_) => {
return None;
}
},

("code-page-size", _) => match opt_val.parse::<usize>() {
Ok(n) => {
// Enforce bounds checks and that n is divisible by 4KiB
if n < 4 || n > 256 || n % 4 != 0 {
return None
}

// Convert from KiB to bytes internally for convenience
unsafe { OPTIONS.code_page_size = n * 1024 }
}
Err(_) => {
return None;
}
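
Illustrative sketch of the unit handling above: exec-mem-size is given in MiB and code-page-size in KiB on the command line, but both are stored in bytes. parse_code_page_size is an invented helper mirroring the bounds in the diff (4..=256 KiB, multiple of 4 KiB):

// Convert the KiB command-line value to bytes, rejecting out-of-range or
// non-4-KiB-aligned sizes, as the option parsing above does.
fn parse_code_page_size(kib: usize) -> Option<usize> {
    if kib < 4 || kib > 256 || kib % 4 != 0 {
        return None;
    }
    Some(kib * 1024) // stored in bytes
}

fn main() {
    assert_eq!(parse_code_page_size(16), Some(16 * 1024)); // the default: 16 KiB pages
    assert_eq!(parse_code_page_size(6), None);             // not a multiple of 4 KiB
    let exec_mem_mib = 256;
    assert_eq!(exec_mem_mib * 1024 * 1024, 268_435_456usize); // 256 MiB stored in bytes
}
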
@@ -74,14 +74,13 @@ pub(crate) use offset_of;
// This should work fine on ASCII strings and anything else
// that is considered legal UTF-8, including embedded nulls.
fn ruby_str_to_rust(v: VALUE) -> String {
// Make sure the CRuby encoding is UTF-8 compatible
let encoding = unsafe { rb_ENCODING_GET(v) } as u32;
assert!(encoding == RUBY_ENCINDEX_ASCII_8BIT || encoding == RUBY_ENCINDEX_UTF_8 || encoding == RUBY_ENCINDEX_US_ASCII);

let str_ptr = unsafe { rb_RSTRING_PTR(v) } as *mut u8;
let str_len: usize = unsafe { rb_RSTRING_LEN(v) }.try_into().unwrap();
let str_slice: &[u8] = unsafe { slice::from_raw_parts(str_ptr, str_len) };
String::from_utf8(str_slice.to_vec()).unwrap() // does utf8 validation
match String::from_utf8(str_slice.to_vec()) {
Ok(utf8) => utf8,
Err(_) => String::new(),
}
}

// Location is the file defining the method, colon, method name.