YJIT: implement variable-length context encoding scheme (#10888)
* Implement BitVector data structure for variable-length context encoding
* Rename method to make intent clearer
* Rename write_uint => push_uint to make intent clearer
* Implement debug trait for BitVector
* Fix bug in BitVector::read_uint_at(), enable more tests
* Add one more test for good measure
* Start sketching Context::encode()
* Progress on variable length context encoding
* Add tests. Fix bug.
* Encode stack state
* Add comments. Try to estimate context encoding size.
* More compact encoding for stack size
* Commit before rebase
* Change Context::encode() to take a BitVector as input
* Refactor BitVector::read_uint(), add helper read functions
* Implement Context::decode() function. Add test.
* Fix bug, add tests
* Rename methods
* Add Context::encode() and decode() methods using global data
* Make encode and decode methods use u32 indices
* Refactor YJIT to use variable-length context encoding
* Tag functions as allow unused
* Add a simple caching mechanism and stats for bytes per context etc
* Add comments, fix formatting
* Grow vector of bytes by 1.2x instead of 2x
* Add debug assert to check round-trip encoding-decoding
* Take some rustfmt formatting
* Add decoded_from field to Context to reuse previous encodings
* Remove olde context stats
* Re-add stack_size assert
* Disable decoded_from optimization for now
parent faad2bc6e1
commit 425e630ce7
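This commit replaces the fixed-size, 23-byte Context struct that was stored inline in every block and branch stub with a variable-length bit encoding appended to a single global BitVector; call sites keep only a u32 offset and decode on demand. Below is a condensed, runnable sketch of the bit-packing idea. It is illustrative only: the `Bits` type and the values in `main` are invented for this note, while the real `BitVector` appears in the yjit/src/core.rs diff further down.

// Minimal sketch of the bit-packing idea behind this commit (not the
// committed code). Values are appended back-to-back with only as many
// bits as requested, so a small field costs 1-4 bits, not a full byte.
struct Bits {
    bytes: Vec<u8>,
    num_bits: usize,
}

impl Bits {
    fn new() -> Self {
        Bits { bytes: Vec::new(), num_bits: 0 }
    }

    // Append the low `n` bits of `val` (n <= 64), LSB-first
    fn push(&mut self, mut val: u64, mut n: usize) {
        while n > 0 {
            let bit = (val & 1) as u8;
            if self.num_bits % 8 == 0 {
                self.bytes.push(0);
            }
            self.bytes[self.num_bits / 8] |= bit << (self.num_bits % 8);
            self.num_bits += 1;
            val >>= 1;
            n -= 1;
        }
    }

    // Read `n` bits starting at *idx, advancing the cursor
    fn read(&self, idx: &mut usize, n: usize) -> u64 {
        let mut out = 0u64;
        for i in 0..n {
            let bit = (self.bytes[*idx / 8] >> (*idx % 8)) & 1;
            out |= (bit as u64) << i;
            *idx += 1;
        }
        out
    }
}

fn main() {
    let mut bits = Bits::new();
    bits.push(1, 1);    // 1-bit flag
    bits.push(3, 2);    // 2-bit stack size
    bits.push(0xAB, 8); // 8-bit field
    let mut idx = 0;
    assert_eq!(bits.read(&mut idx, 1), 1);
    assert_eq!(bits.read(&mut idx, 2), 3);
    assert_eq!(bits.read(&mut idx, 8), 0xAB);
    println!("11 bits stored in {} byte(s)", bits.bytes.len()); // 2
}

As in the committed encoder, values are packed LSB-first, so the 11-bit payload above occupies two bytes instead of the three it would take with byte-aligned fields.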
yjit.c (2 changes)
@@ -1245,7 +1245,7 @@ rb_yjit_set_exception_return(rb_control_frame_t *cfp, void *leave_exit, void *le
 VALUE rb_yjit_stats_enabled_p(rb_execution_context_t *ec, VALUE self);
 VALUE rb_yjit_print_stats_p(rb_execution_context_t *ec, VALUE self);
 VALUE rb_yjit_trace_exit_locations_enabled_p(rb_execution_context_t *ec, VALUE self);
-VALUE rb_yjit_get_stats(rb_execution_context_t *ec, VALUE self, VALUE context);
+VALUE rb_yjit_get_stats(rb_execution_context_t *ec, VALUE self);
 VALUE rb_yjit_reset_stats_bang(rb_execution_context_t *ec, VALUE self);
 VALUE rb_yjit_disasm_iseq(rb_execution_context_t *ec, VALUE self, VALUE iseq);
 VALUE rb_yjit_insns_compiled(rb_execution_context_t *ec, VALUE self, VALUE iseq);
yjit.rb (14 changes)
@@ -155,8 +155,8 @@ module RubyVM::YJIT

   # Return a hash for statistics generated for the `--yjit-stats` command line option.
   # Return `nil` when option is not passed or unavailable.
-  def self.runtime_stats(context: false)
-    stats = Primitive.rb_yjit_get_stats(context)
+  def self.runtime_stats()
+    stats = Primitive.rb_yjit_get_stats()
     return stats if stats.nil?

     stats[:object_shape_count] = Primitive.object_shape_count
@@ -313,7 +313,7 @@ module RubyVM::YJIT

   # Format and print out counters
   def _print_stats(out: $stderr) # :nodoc:
-    stats = runtime_stats(context: true)
+    stats = runtime_stats()
     return unless Primitive.rb_yjit_stats_enabled_p

     out.puts("***YJIT: Printing YJIT statistics on exit***")
@@ -388,8 +388,12 @@ module RubyVM::YJIT

     out.puts "freed_code_size: " + format_number(13, stats[:freed_code_size])
     out.puts "yjit_alloc_size: " + format_number(13, stats[:yjit_alloc_size]) if stats.key?(:yjit_alloc_size)
-    out.puts "live_context_size: " + format_number(13, stats[:live_context_size])
-    out.puts "live_context_count: " + format_number(13, stats[:live_context_count])
+
+    bytes_per_context = stats[:context_data_bytes].fdiv(stats[:num_contexts_encoded])
+    out.puts "context_data_bytes: " + format_number(13, stats[:context_data_bytes])
+    out.puts "num_contexts_encoded: " + format_number(13, stats[:num_contexts_encoded])
+    out.puts "bytes_per_context: " + ("%13.2f" % bytes_per_context)

     out.puts "live_page_count: " + format_number(13, stats[:live_page_count])
     out.puts "freed_page_count: " + format_number(13, stats[:freed_page_count])
     out.puts "code_gc_count: " + format_number(13, stats[:code_gc_count])
yjit/src/codegen.rs

@@ -5789,7 +5789,7 @@ fn jit_rb_str_getbyte(
         RUBY_OFFSET_RSTRING_LEN as i32,
     );

-    // Exit if the indes is out of bounds
+    // Exit if the index is out of bounds
     asm.cmp(idx, str_len_opnd);
     asm.jge(Target::side_exit(Counter::getbyte_idx_out_of_bounds));

@@ -10333,6 +10333,9 @@ fn yjit_reg_method(klass: VALUE, mid_str: &str, gen_fn: MethodGenFn) {

 /// Global state needed for code generation
 pub struct CodegenGlobals {
+    /// Flat vector of bits to store compressed context data
+    context_data: BitVector,
+
     /// Inline code block (fast path)
     inline_cb: CodeBlock,

@@ -10448,6 +10451,7 @@ impl CodegenGlobals {
         ocb.unwrap().mark_all_executable();

         let codegen_globals = CodegenGlobals {
+            context_data: BitVector::new(),
             inline_cb: cb,
             outlined_cb: ocb,
             leave_exit_code,
@@ -10476,6 +10480,11 @@ impl CodegenGlobals {
         unsafe { CODEGEN_GLOBALS.as_mut().is_some() }
     }

+    /// Get a mutable reference to the context data
+    pub fn get_context_data() -> &'static mut BitVector {
+        &mut CodegenGlobals::get_instance().context_data
+    }
+
     /// Get a mutable reference to the inline code block
     pub fn get_inline_cb() -> &'static mut CodeBlock {
         &mut CodegenGlobals::get_instance().inline_cb
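The new context_data field hangs off the CodegenGlobals singleton, and get_context_data() hands out a &'static mut reference the same way get_inline_cb() does. A toy version of that accessor pattern follows; it is sound in YJIT only because globals are initialized once at boot and mutated single-threaded under the VM lock, and this sketch (with the invented `Globals` type) makes the same assumption. Note that newer toolchains flag `static mut` references with the static_mut_refs lint.

// Sketch of the global-accessor pattern used above, reduced to a toy.
struct Globals {
    context_data: Vec<u8>, // stand-in for the BitVector field
}

static mut GLOBALS: Option<Globals> = None;

impl Globals {
    fn init() {
        unsafe { GLOBALS = Some(Globals { context_data: Vec::new() }) }
    }

    fn get_instance() -> &'static mut Globals {
        unsafe { GLOBALS.as_mut().unwrap() }
    }

    // Mirrors CodegenGlobals::get_context_data() in the diff above
    fn get_context_data() -> &'static mut Vec<u8> {
        &mut Globals::get_instance().context_data
    }
}

fn main() {
    Globals::init();
    Globals::get_context_data().push(42);
    assert_eq!(Globals::get_context_data().len(), 1);
}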
yjit/src/core.rs (627 changes)
@@ -457,8 +457,13 @@ const CHAIN_DEPTH_MASK: u8 = 0b00111111; // 63
 /// Contains information we can use to specialize/optimize code
 /// There are a lot of context objects so we try to keep the size small.
 #[derive(Copy, Clone, Default, Eq, Hash, PartialEq, Debug)]
 #[repr(packed)]
 pub struct Context {
+    // FIXME: decoded_from breaks == on contexts
+    /*
+    // Offset at which this context was previously encoded (zero if not)
+    decoded_from: u32,
+    */

     // Number of values currently on the temporary stack
     stack_size: u8,
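The commented-out decoded_from field corresponds to the last two commit-message bullets: caching the encoding offset inside the struct would let encode() reuse a previous encoding, but any extra field participates in the derived Eq/Hash, so two logically identical contexts stop comparing equal. A minimal illustration of the hazard (the `Ctx` type here is invented for this note):

// Why a decoded_from field breaks ==: with a derived PartialEq, a context
// freshly built by the compiler (decoded_from == 0) no longer compares
// equal to the same logical context read back out of the bit vector
// (decoded_from == some nonzero offset).
#[derive(PartialEq, Debug, Clone, Copy, Default)]
struct Ctx {
    stack_size: u8,
    decoded_from: u32, // cache field, not logical state
}

fn main() {
    let fresh = Ctx { stack_size: 2, decoded_from: 0 };
    let decoded = Ctx { stack_size: 2, decoded_from: 17 };
    // Logically the same context, but the derived == says otherwise,
    // which would defeat block-version lookup; hence the field is
    // commented out and the optimization is disabled for now.
    assert_ne!(fresh, decoded);
}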
@@ -498,6 +503,568 @@ pub struct Context {
     inline_block: u64,
 }

+#[derive(Clone)]
+pub struct BitVector {
+    // Flat vector of bytes to write into
+    bytes: Vec<u8>,
+
+    // Number of bits taken out of bytes allocated
+    num_bits: usize,
+}
+
+impl BitVector {
+    pub fn new() -> Self {
+        Self {
+            bytes: Vec::with_capacity(4096),
+            num_bits: 0,
+        }
+    }
+
+    #[allow(unused)]
+    pub fn num_bits(&self) -> usize {
+        self.num_bits
+    }
+
+    // Total number of bytes taken
+    #[allow(unused)]
+    pub fn num_bytes(&self) -> usize {
+        (self.num_bits / 8) + if (self.num_bits % 8) != 0 { 1 } else { 0 }
+    }
+
+    // Write/append an unsigned integer value
+    fn push_uint(&mut self, mut val: u64, mut num_bits: usize) {
+        assert!(num_bits <= 64);
+
+        // Mask out bits above the number of bits requested
+        let mut val_bits = val;
+        if num_bits < 64 {
+            val_bits &= (1 << num_bits) - 1;
+            assert!(val == val_bits);
+        }
+
+        // Number of bits encoded in the last byte
+        let rem_bits = self.num_bits % 8;
+
+        // Encode as many bits as we can in this last byte
+        if rem_bits != 0 {
+            let num_enc = std::cmp::min(num_bits, 8 - rem_bits);
+            let bit_mask = (1 << num_enc) - 1;
+            let frac_bits = (val & bit_mask) << rem_bits;
+            let frac_bits: u8 = frac_bits.try_into().unwrap();
+            let last_byte_idx = self.bytes.len() - 1;
+            self.bytes[last_byte_idx] |= frac_bits;
+
+            self.num_bits += num_enc;
+            num_bits -= num_enc;
+            val >>= num_enc;
+        }
+
+        // While we have bits left to encode
+        while num_bits > 0 {
+            // Grow with a 1.2x growth factor instead of 2x
+            assert!(self.num_bits % 8 == 0);
+            let num_bytes = self.num_bits / 8;
+            if num_bytes == self.bytes.capacity() {
+                self.bytes.reserve_exact(self.bytes.len() / 5);
+            }
+
+            let bits = val & 0xFF;
+            let bits: u8 = bits.try_into().unwrap();
+            self.bytes.push(bits);
+
+            let bits_to_encode = std::cmp::min(num_bits, 8);
+            self.num_bits += bits_to_encode;
+            num_bits -= bits_to_encode;
+            val >>= bits_to_encode;
+        }
+    }
+
+    fn push_u8(&mut self, val: u8) {
+        self.push_uint(val as u64, 8);
+    }
+
+    fn push_u4(&mut self, val: u8) {
+        assert!(val < 16);
+        self.push_uint(val as u64, 4);
+    }
+
+    fn push_u3(&mut self, val: u8) {
+        assert!(val < 8);
+        self.push_uint(val as u64, 3);
+    }
+
+    fn push_u2(&mut self, val: u8) {
+        assert!(val < 4);
+        self.push_uint(val as u64, 2);
+    }
+
+    fn push_u1(&mut self, val: u8) {
+        assert!(val < 2);
+        self.push_uint(val as u64, 1);
+    }
+
+    // Push a context encoding opcode
+    fn push_op(&mut self, op: CtxOp) {
+        self.push_u4(op as u8);
+    }
+
+    // Read a uint value at a given bit index
+    // The bit index is incremented after the value is read
+    fn read_uint(&self, bit_idx: &mut usize, mut num_bits: usize) -> u64 {
+        let start_bit_idx = *bit_idx;
+        let mut cur_idx = *bit_idx;
+
+        // Read the bits in the first byte
+        let bit_mod = cur_idx % 8;
+        let bits_in_byte = self.bytes[cur_idx / 8] >> bit_mod;
+
+        let num_bits_in_byte = std::cmp::min(num_bits, 8 - bit_mod);
+        cur_idx += num_bits_in_byte;
+        num_bits -= num_bits_in_byte;
+
+        let mut out_bits = (bits_in_byte as u64) & ((1 << num_bits_in_byte) - 1);
+
+        // While we have bits left to read
+        while num_bits > 0 {
+            let num_bits_in_byte = std::cmp::min(num_bits, 8);
+            assert!(cur_idx % 8 == 0);
+            let byte = self.bytes[cur_idx / 8] as u64;
+
+            let bits_in_byte = byte & ((1 << num_bits) - 1);
+            out_bits |= bits_in_byte << (cur_idx - start_bit_idx);
+
+            // Move to the next byte/offset
+            cur_idx += num_bits_in_byte;
+            num_bits -= num_bits_in_byte;
+        }
+
+        // Update the read index
+        *bit_idx = cur_idx;
+
+        out_bits
+    }
+
+    fn read_u8(&self, bit_idx: &mut usize) -> u8 {
+        self.read_uint(bit_idx, 8) as u8
+    }
+
+    fn read_u4(&self, bit_idx: &mut usize) -> u8 {
+        self.read_uint(bit_idx, 4) as u8
+    }
+
+    fn read_u3(&self, bit_idx: &mut usize) -> u8 {
+        self.read_uint(bit_idx, 3) as u8
+    }
+
+    fn read_u2(&self, bit_idx: &mut usize) -> u8 {
+        self.read_uint(bit_idx, 2) as u8
+    }
+
+    fn read_u1(&self, bit_idx: &mut usize) -> u8 {
+        self.read_uint(bit_idx, 1) as u8
+    }
+
+    fn read_op(&self, bit_idx: &mut usize) -> CtxOp {
+        unsafe { std::mem::transmute(self.read_u4(bit_idx)) }
+    }
+}
+
+impl fmt::Debug for BitVector {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // We print the higher bytes first
+        for (idx, byte) in self.bytes.iter().enumerate().rev() {
+            write!(f, "{:08b}", byte)?;
+
+            // Insert a separator between each byte
+            if idx > 0 {
+                write!(f, "|")?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod bitvector_tests {
+    use super::*;
+
+    #[test]
+    fn write_3() {
+        let mut arr = BitVector::new();
+        arr.push_uint(3, 2);
+        assert!(arr.read_uint(&mut 0, 2) == 3);
+    }
+
+    #[test]
+    fn write_11() {
+        let mut arr = BitVector::new();
+        arr.push_uint(1, 1);
+        arr.push_uint(1, 1);
+        assert!(arr.read_uint(&mut 0, 2) == 3);
+    }
+
+    #[test]
+    fn write_11_overlap() {
+        let mut arr = BitVector::new();
+        arr.push_uint(0, 7);
+        arr.push_uint(3, 2);
+        arr.push_uint(1, 1);
+
+        //dbg!(arr.read_uint(7, 2));
+        assert!(arr.read_uint(&mut 7, 2) == 3);
+    }
+
+    #[test]
+    fn write_ff_0() {
+        let mut arr = BitVector::new();
+        arr.push_uint(0xFF, 8);
+        assert!(arr.read_uint(&mut 0, 8) == 0xFF);
+    }
+
+    #[test]
+    fn write_ff_3() {
+        // Write 0xFF at bit index 3
+        let mut arr = BitVector::new();
+        arr.push_uint(0, 3);
+        arr.push_uint(0xFF, 8);
+        assert!(arr.read_uint(&mut 3, 8) == 0xFF);
+    }
+
+    #[test]
+    fn write_ff_sandwich() {
+        // Write 0xFF sandwiched between zeros
+        let mut arr = BitVector::new();
+        arr.push_uint(0, 3);
+        arr.push_u8(0xFF);
+        arr.push_uint(0, 3);
+        assert!(arr.read_uint(&mut 3, 8) == 0xFF);
+    }
+
+    #[test]
+    fn write_read_u32_max() {
+        let mut arr = BitVector::new();
+        arr.push_uint(0xFF_FF_FF_FF, 32);
+        assert!(arr.read_uint(&mut 0, 32) == 0xFF_FF_FF_FF);
+    }
+
+    #[test]
+    fn write_read_u32_max_64b() {
+        let mut arr = BitVector::new();
+        arr.push_uint(0xFF_FF_FF_FF, 64);
+        assert!(arr.read_uint(&mut 0, 64) == 0xFF_FF_FF_FF);
+    }
+
+    #[test]
+    fn write_read_u64_max() {
+        let mut arr = BitVector::new();
+        arr.push_uint(u64::MAX, 64);
+        assert!(arr.read_uint(&mut 0, 64) == u64::MAX);
+    }
+
+    #[test]
+    fn encode_default() {
+        let mut bits = BitVector::new();
+        let ctx = Context::default();
+        let start_idx = ctx.encode_into(&mut bits);
+        assert!(start_idx == 0);
+        assert!(bits.num_bits() > 0);
+        assert!(bits.num_bytes() > 0);
+
+        // Make sure that the round trip matches the input
+        let ctx2 = Context::decode_from(&bits, 0);
+        assert!(ctx2 == ctx);
+    }
+
+    #[test]
+    fn encode_default_2x() {
+        let mut bits = BitVector::new();
+
+        let ctx0 = Context::default();
+        let idx0 = ctx0.encode_into(&mut bits);
+
+        let mut ctx1 = Context::default();
+        ctx1.reg_temps = RegTemps(1);
+        let idx1 = ctx1.encode_into(&mut bits);
+
+        // Make sure that we can encode two contexts successively
+        let ctx0_dec = Context::decode_from(&bits, idx0);
+        let ctx1_dec = Context::decode_from(&bits, idx1);
+        assert!(ctx0_dec == ctx0);
+        assert!(ctx1_dec == ctx1);
+    }
+
+    #[test]
+    fn regress_reg_temps() {
+        let mut bits = BitVector::new();
+        let mut ctx = Context::default();
+        ctx.reg_temps = RegTemps(1);
+        ctx.encode_into(&mut bits);
+
+        let b0 = bits.read_u1(&mut 0);
+        assert!(b0 == 1);
+
+        // Make sure that the round trip matches the input
+        let ctx2 = Context::decode_from(&bits, 0);
+        assert!(ctx2 == ctx);
+    }
+}
+
+// Context encoding opcodes (4 bits)
+#[derive(Debug, Copy, Clone)]
+#[repr(u8)]
+enum CtxOp {
+    // Self type (4 bits)
+    SetSelfType = 0,
+
+    // Local idx (3 bits), temp type (4 bits)
+    SetLocalType,
+
+    // Map stack temp to self with known type
+    // Temp idx (3 bits), known type (4 bits)
+    SetTempType,
+
+    // Map stack temp to a local variable
+    // Temp idx (3 bits), local idx (3 bits)
+    MapTempLocal,
+
+    // Map a stack temp to self
+    // Temp idx (3 bits)
+    MapTempSelf,
+
+    // Set inline block pointer (8 bytes)
+    SetInlineBlock,
+
+    // End of encoding
+    EndOfCode,
+}
+
+// Cache of the last context encoded
+// Empirically this saves a few percent of memory
+// We can experiment with varying the size of this cache
+static mut LAST_CTX_ENCODED: Option<(Context, u32)> = None;
+
+impl Context {
+    pub fn encode(&self) -> u32 {
+        incr_counter!(num_contexts_encoded);
+
+        if *self == Context::default() {
+            return 0;
+        }
+
+        /*
+        // If this context was previously decoded and was not changed since
+        if self.decoded_from != 0 && Self::decode(self.decoded_from) == *self {
+            return self.decoded_from;
+        }
+        */
+
+        // If this context was recently encoded (cache check)
+        unsafe {
+            if let Some((ctx, idx)) = LAST_CTX_ENCODED {
+                if ctx == *self {
+                    return idx;
+                }
+            }
+        }
+
+        let context_data = CodegenGlobals::get_context_data();
+
+        // Offset 0 is reserved for the default context
+        if context_data.num_bits() == 0 {
+            context_data.push_u1(0);
+        }
+
+        let idx = self.encode_into(context_data);
+        let idx: u32 = idx.try_into().unwrap();
+
+        unsafe {
+            LAST_CTX_ENCODED = Some((*self, idx));
+        }
+
+        // In debug mode, check that the round-trip decoding always matches
+        debug_assert!(Self::decode(idx) == *self);
+
+        idx
+    }
+
+    pub fn decode(start_idx: u32) -> Context {
+        if start_idx == 0 {
+            return Context::default();
+        };
+
+        let context_data = CodegenGlobals::get_context_data();
+        let ctx = Self::decode_from(context_data, start_idx as usize);
+
+        // Keep track of the fact that this context was previously encoded
+        //ctx.decoded_from = start_idx;
+
+        ctx
+    }
+
+    // Encode into a compressed context representation in a bit vector
+    fn encode_into(&self, bits: &mut BitVector) -> usize {
+        let start_idx = bits.num_bits();
+
+        // NOTE: this value is often zero or falls within
+        // a small range, so could be compressed
+        //println!("stack_size={}", self.stack_size);
+        //println!("sp_offset={}", self.sp_offset);
+        //println!("chain_depth_and_flags={}", self.chain_depth_and_flags);
+
+        // Most of the time, the stack size is small and sp offset has the same value
+        if (self.stack_size as i64) == (self.sp_offset as i64) && self.stack_size < 4 {
+            // One single bit to signify a compact stack_size/sp_offset encoding
+            bits.push_u1(1);
+            bits.push_u2(self.stack_size);
+        } else {
+            // Full stack size encoding
+            bits.push_u1(0);
+
+            // Number of values currently on the temporary stack
+            bits.push_u8(self.stack_size);
+
+            // sp_offset: i8,
+            bits.push_u8(self.sp_offset as u8);
+        }
+
+        // Bitmap of which stack temps are in a register
+        let RegTemps(reg_temps) = self.reg_temps;
+        bits.push_u8(reg_temps);
+
+        // chain_depth_and_flags: u8,
+        bits.push_u8(self.chain_depth_and_flags);
+
+        // Encode the self type if known
+        if self.self_type != Type::Unknown {
+            bits.push_op(CtxOp::SetSelfType);
+            bits.push_u4(self.self_type as u8);
+        }
+
+        // Encode the local types if known
+        for local_idx in 0..MAX_LOCAL_TYPES {
+            let t = self.get_local_type(local_idx);
+            if t != Type::Unknown {
+                bits.push_op(CtxOp::SetLocalType);
+                bits.push_u3(local_idx as u8);
+                bits.push_u4(t as u8);
+            }
+        }
+
+        // Encode stack temps
+        for stack_idx in 0..MAX_TEMP_TYPES {
+            let mapping = self.get_temp_mapping(stack_idx);
+
+            match mapping.get_kind() {
+                MapToStack => {
+                    let t = mapping.get_type();
+                    if t != Type::Unknown {
+                        // Temp idx (3 bits), known type (4 bits)
+                        bits.push_op(CtxOp::SetTempType);
+                        bits.push_u3(stack_idx as u8);
+                        bits.push_u4(t as u8);
+                    }
+                }
+
+                MapToLocal => {
+                    // Temp idx (3 bits), local idx (3 bits)
+                    let local_idx = mapping.get_local_idx();
+                    bits.push_op(CtxOp::MapTempLocal);
+                    bits.push_u3(stack_idx as u8);
+                    bits.push_u3(local_idx as u8);
+                }
+
+                MapToSelf => {
+                    // Temp idx (3 bits)
+                    bits.push_op(CtxOp::MapTempSelf);
+                    bits.push_u3(stack_idx as u8);
+                }
+            }
+        }
+
+        // Inline block pointer
+        if self.inline_block != 0 {
+            bits.push_op(CtxOp::SetInlineBlock);
+            bits.push_uint(self.inline_block, 64);
+        }
+
+        // TODO: should we add an op for end-of-encoding,
+        // or store num ops at the beginning?
+        bits.push_op(CtxOp::EndOfCode);
+
+        start_idx
+    }
+
+    // Decode a compressed context representation from a bit vector
+    fn decode_from(bits: &BitVector, start_idx: usize) -> Context {
+        let mut ctx = Context::default();
+
+        let mut idx = start_idx;
+
+        // Small vs large stack size encoding
+        if bits.read_u1(&mut idx) == 1 {
+            ctx.stack_size = bits.read_u2(&mut idx);
+            ctx.sp_offset = ctx.stack_size as i8;
+        } else {
+            ctx.stack_size = bits.read_u8(&mut idx);
+            ctx.sp_offset = bits.read_u8(&mut idx) as i8;
+        }
+
+        // Bitmap of which stack temps are in a register
+        ctx.reg_temps = RegTemps(bits.read_u8(&mut idx));
+
+        // chain_depth_and_flags: u8
+        ctx.chain_depth_and_flags = bits.read_u8(&mut idx);
+
+        loop {
+            //println!("reading op");
+            let op = bits.read_op(&mut idx);
+            //println!("got op {:?}", op);
+
+            match op {
+                CtxOp::SetSelfType => {
+                    ctx.self_type = unsafe { transmute(bits.read_u4(&mut idx)) };
+                }
+
+                CtxOp::SetLocalType => {
+                    let local_idx = bits.read_u3(&mut idx) as usize;
+                    let t = unsafe { transmute(bits.read_u4(&mut idx)) };
+                    ctx.set_local_type(local_idx, t);
+                }

+                // Map temp to stack (known type)
+                CtxOp::SetTempType => {
+                    let temp_idx = bits.read_u3(&mut idx) as usize;
+                    let t = unsafe { transmute(bits.read_u4(&mut idx)) };
+                    ctx.set_temp_mapping(temp_idx, TempMapping::map_to_stack(t));
+                }
+
+                // Map temp to local
+                CtxOp::MapTempLocal => {
+                    let temp_idx = bits.read_u3(&mut idx) as usize;
+                    let local_idx = bits.read_u3(&mut idx);
+                    ctx.set_temp_mapping(temp_idx, TempMapping::map_to_local(local_idx));
+                }
+
+                // Map temp to self
+                CtxOp::MapTempSelf => {
+                    let temp_idx = bits.read_u3(&mut idx) as usize;
+                    ctx.set_temp_mapping(temp_idx, TempMapping::map_to_self());
+                }
+
+                // Inline block pointer
+                CtxOp::SetInlineBlock => {
+                    ctx.inline_block = bits.read_uint(&mut idx, 64);
+                }
+
+                CtxOp::EndOfCode => break,
+            }
+        }
+
+        ctx
+    }
+}
+
 /// Tuple of (iseq, idx) used to identify basic blocks
 /// There are a lot of blockid objects so we try to keep the size small.
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
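From encode_into() above you can tally the exact bit budget of a typical context. A worked example, assuming a small stack (stack_size == sp_offset < 4), no known types, and no inline block (the helper function here is illustrative, not part of the commit):

// Worked example of the encoding cost, following encode_into() above:
//
//   1 bit   compact stack_size/sp_offset flag (1)
//   2 bits  stack_size
//   8 bits  reg_temps bitmap
//   8 bits  chain_depth_and_flags
//   4 bits  CtxOp::EndOfCode
//  -------
//  23 bits  -> 3 bytes, vs. 23 bytes for the old #[repr(packed)] struct
//
// Each SetTempType op adds another 4 + 3 + 4 = 11 bits, and the worst
// case (SetInlineBlock) adds 4 + 64 bits, so only unusual contexts
// approach the old fixed size.
fn encoded_bits_for_simple_ctx() -> usize {
    1 + 2 + 8 + 8 + 4
}

fn main() {
    assert_eq!(encoded_bits_for_simple_ctx(), 23);
    assert_eq!((23 + 7) / 8, 3); // rounded up to whole bytes
}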
@@ -659,7 +1226,7 @@ impl BranchTarget {
         }
     }

-    fn get_ctx(&self) -> Context {
+    fn get_ctx(&self) -> u32 {
         match self {
             BranchTarget::Stub(stub) => stub.ctx,
             BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.ctx,
@@ -686,7 +1253,7 @@ struct BranchStub {
     address: Option<CodePtr>,
     iseq: Cell<IseqPtr>,
     iseq_idx: IseqIdx,
-    ctx: Context,
+    ctx: u32,
 }

 /// Store info about an outgoing branch in a code segment
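These two hunks show the structural payoff: BranchStub (and Block below) now stores a 4-byte u32 offset into the global bit vector instead of an inline Context. A sketch of the size difference, with simplified stand-in types rather than the real structs:

// The 23-byte figure matches the context_size test removed later in
// this diff; the stand-in types below are invented for illustration.
use std::mem::size_of;

#[allow(dead_code)]
#[repr(packed)]
struct InlineCtx {
    bytes: [u8; 23], // stand-in for the old packed Context
}

#[allow(dead_code)]
struct OldStub { ctx: InlineCtx }
#[allow(dead_code)]
struct NewStub { ctx: u32 } // offset into CodegenGlobals::get_context_data()

fn main() {
    assert_eq!(size_of::<OldStub>(), 23);
    assert_eq!(size_of::<NewStub>(), 4);
    // The trade-off: every use of the context now costs a decode pass,
    // which is why encode() keeps a one-entry cache and why offset 0 is
    // reserved as a fast path for the default context.
}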
@@ -808,6 +1375,9 @@ impl PendingBranch {
             return Some(block.start_addr);
         }

+        // Compress/encode the context
+        let ctx = Context::encode(ctx);
+
         // The branch struct is uninitialized right now but as a stable address.
         // We make sure the stub runs after the branch is initialized.
         let branch_struct_addr = self.uninit_branch.as_ptr() as usize;
@@ -819,7 +1389,7 @@ impl PendingBranch {
             address: Some(stub_addr),
             iseq: Cell::new(target.iseq),
             iseq_idx: target.idx,
-            ctx: *ctx,
+            ctx,
         })))));
     }

@@ -912,7 +1482,7 @@ pub struct Block {

     // Context at the start of the block
     // This should never be mutated
-    ctx: Context,
+    ctx: u32,

     // Positions where the generated code starts and ends
     start_addr: CodePtr,
@@ -1085,15 +1655,6 @@ pub fn for_each_iseq<F: FnMut(IseqPtr)>(mut callback: F) {
     unsafe { rb_yjit_for_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) };
 }

-/// Iterate over all ISEQ payloads
-pub fn for_each_iseq_payload<F: FnMut(&IseqPayload)>(mut callback: F) {
-    for_each_iseq(|iseq| {
-        if let Some(iseq_payload) = get_iseq_payload(iseq) {
-            callback(iseq_payload);
-        }
-    });
-}
-
 /// Iterate over all on-stack ISEQs
 pub fn for_each_on_stack_iseq<F: FnMut(IseqPtr)>(mut callback: F) {
     unsafe extern "C" fn callback_wrapper(iseq: IseqPtr, data: *mut c_void) {
@@ -1425,13 +1986,17 @@ pub fn take_version_list(blockid: BlockId) -> VersionList {
 fn get_num_versions(blockid: BlockId, inlined: bool) -> usize {
     let insn_idx = blockid.idx.as_usize();
     match get_iseq_payload(blockid.iseq) {
+
+        // FIXME: this counting logic is going to be expensive.
+        // We should avoid it if possible
+
         Some(payload) => {
             payload
                 .version_map
                 .get(insn_idx)
                 .map(|versions| {
                     versions.iter().filter(|&&version|
-                        unsafe { version.as_ref() }.ctx.inline() == inlined
+                        Context::decode(unsafe { version.as_ref() }.ctx).inline() == inlined
                     ).count()
                 })
                 .unwrap_or(0)
@@ -1476,10 +2041,11 @@ fn find_block_version(blockid: BlockId, ctx: &Context) -> Option<BlockRef> {
     // For each version matching the blockid
     for blockref in versions.iter() {
         let block = unsafe { blockref.as_ref() };
+        let block_ctx = Context::decode(block.ctx);

         // Note that we always prefer the first matching
         // version found because of inline-cache chains
-        match ctx.diff(&block.ctx) {
+        match ctx.diff(&block_ctx) {
             TypeDiff::Compatible(diff) if diff < best_diff => {
                 best_version = Some(*blockref);
                 best_diff = diff;
@@ -1561,7 +2127,7 @@ unsafe fn add_block_version(blockref: BlockRef, cb: &CodeBlock) {
     let block = unsafe { blockref.as_ref() };

     // Function entry blocks must have stack size 0
-    assert!(!(block.iseq_range.start == 0 && block.ctx.stack_size > 0));
+    debug_assert!(!(block.iseq_range.start == 0 && Context::decode(block.ctx).stack_size > 0));

     let version_list = get_or_create_version_list(block.get_blockid());

@@ -1620,12 +2186,14 @@ impl JITState {

         incr_counter_by!(num_gc_obj_refs, gc_obj_offsets.len());

+        let ctx = Context::encode(&self.get_starting_ctx());
+
         // Make the new block
         let block = MaybeUninit::new(Block {
             start_addr,
             iseq: Cell::new(self.get_iseq()),
             iseq_range: self.get_starting_insn_idx()..end_insn_idx,
-            ctx: self.get_starting_ctx(),
+            ctx,
             end_addr: Cell::new(end_addr),
             incoming: MutableBranchList(Cell::default()),
             gc_obj_offsets: gc_obj_offsets.into_boxed_slice(),
@@ -2382,6 +2950,7 @@ fn gen_block_series_body(
     };

     // Generate new block using context from the last branch.
+    let requested_ctx = Context::decode(requested_ctx);
     let result = gen_single_block(requested_blockid, &requested_ctx, ec, cb, ocb);

     // If the block failed to compile
@@ -2769,7 +3338,8 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) -
             return target.get_address().unwrap().raw_ptr(cb);
         }

-        (target.get_blockid(), target.get_ctx())
+        let target_ctx = Context::decode(target.get_ctx());
+        (target.get_blockid(), target_ctx)
     };

     let (cfp, original_interp_sp) = unsafe {
@@ -2906,7 +3476,7 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) -
 /// Generate a "stub", a piece of code that calls the compiler back when run.
 /// A piece of code that redeems for more code; a thunk for code.
 fn gen_branch_stub(
-    ctx: &Context,
+    ctx: u32,
     ocb: &mut OutlinedCb,
     branch_struct_address: usize,
     target_idx: u32,
@@ -2914,8 +3484,8 @@ fn gen_branch_stub(
     let ocb = ocb.unwrap();

     let mut asm = Assembler::new();
-    asm.ctx = *ctx;
-    asm.set_reg_temps(ctx.reg_temps);
+    asm.ctx = Context::decode(ctx);
+    asm.set_reg_temps(asm.ctx.reg_temps);
     asm_comment!(asm, "branch stub hit");

     if asm.ctx.is_return_landing() {
@@ -3112,7 +3682,7 @@ pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm:
         // compile the target block right after this one (fallthrough).
         BranchTarget::Stub(Box::new(BranchStub {
             address: None,
-            ctx: *ctx,
+            ctx: Context::encode(ctx),
             iseq: Cell::new(target0.iseq),
             iseq_idx: target0.idx,
         }))
@@ -3364,7 +3934,7 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
         }

         // Create a stub for this branch target
-        let stub_addr = gen_branch_stub(&block.ctx, ocb, branchref.as_ptr() as usize, target_idx as u32);
+        let stub_addr = gen_branch_stub(block.ctx, ocb, branchref.as_ptr() as usize, target_idx as u32);

         // In case we were unable to generate a stub (e.g. OOM). Use the block's
         // exit instead of a stub for the block. It's important that we
@@ -3546,11 +4116,6 @@ mod tests {
         assert_eq!(t.get_local_idx(), 7);
     }

-    #[test]
-    fn context_size() {
-        assert_eq!(mem::size_of::<Context>(), 23);
-    }
-
     #[test]
     fn types() {
         // Valid src => dst
@@ -3695,7 +4260,7 @@ mod tests {
                 iseq: Cell::new(ptr::null()),
                 iseq_idx: 0,
                 address: None,
-                ctx: Context::default(),
+                ctx: 0,
             })))))]
         };
         // For easier soundness reasoning, make sure the reference returned does not out live the
@@ -3728,7 +4293,7 @@ mod tests {
             iseq: Cell::new(ptr::null()),
             iseq_idx: 0,
             address: None,
-            ctx: Context::default(),
+            ctx: 0,
         })))));
         // Invalid ISeq; we never dereference it.
         let secret_iseq = NonNull::<rb_iseq_t>::dangling().as_ptr();
yjit/src/stats.rs

@@ -10,8 +10,6 @@ use std::time::Instant;
 use std::collections::HashMap;

 use crate::codegen::CodegenGlobals;
-use crate::core::Context;
-use crate::core::for_each_iseq_payload;
 use crate::cruby::*;
 use crate::options::*;
 use crate::yjit::yjit_enabled_p;
@@ -557,6 +555,7 @@ make_counters! {
     branch_insn_count,
     branch_known_count,
     max_inline_versions,
+    num_contexts_encoded,

     freed_iseq_count,

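The new num_contexts_encoded counter pairs with context_data_bytes to produce the bytes_per_context figure printed by yjit.rb above. A sketch of the arithmetic, with made-up numbers purely for illustration:

// num_contexts_encoded counts calls to Context::encode(), and
// context_data_bytes is the size of the shared bit vector.
fn main() {
    let context_data_bytes: usize = 150_000;   // invented value
    let num_contexts_encoded: usize = 60_000;  // invented value
    let bytes_per_context = context_data_bytes as f64 / num_contexts_encoded as f64;
    // 2.5 bytes per encoded context here, versus 23 bytes for the old
    // fixed-size struct; the one-entry cache and the reserved default
    // offset can push the average below the raw per-context bit cost.
    println!("bytes_per_context: {:.2}", bytes_per_context);
}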
@@ -641,8 +640,8 @@ pub extern "C" fn rb_yjit_print_stats_p(_ec: EcPtr, _ruby_self: VALUE) -> VALUE
 /// Primitive called in yjit.rb.
 /// Export all YJIT statistics as a Ruby hash.
 #[no_mangle]
-pub extern "C" fn rb_yjit_get_stats(_ec: EcPtr, _ruby_self: VALUE, context: VALUE) -> VALUE {
-    with_vm_lock(src_loc!(), || rb_yjit_gen_stats_dict(context == Qtrue))
+pub extern "C" fn rb_yjit_get_stats(_ec: EcPtr, _ruby_self: VALUE) -> VALUE {
+    with_vm_lock(src_loc!(), || rb_yjit_gen_stats_dict())
 }

 /// Primitive called in yjit.rb
@@ -701,7 +700,7 @@ pub extern "C" fn rb_yjit_incr_counter(counter_name: *const std::os::raw::c_char
 }

 /// Export all YJIT statistics as a Ruby hash.
-fn rb_yjit_gen_stats_dict(context: bool) -> VALUE {
+fn rb_yjit_gen_stats_dict() -> VALUE {
     // If YJIT is not enabled, return Qnil
     if !yjit_enabled_p() {
         return Qnil;
@@ -744,14 +743,9 @@ fn rb_yjit_gen_stats_dict(context: bool) -> VALUE {
     // Rust global allocations in bytes
     hash_aset_usize!(hash, "yjit_alloc_size", GLOBAL_ALLOCATOR.alloc_size.load(Ordering::SeqCst));

-    // `context` is true at RubyVM::YJIT._print_stats for --yjit-stats. It's false by default
-    // for RubyVM::YJIT.runtime_stats because counting all Contexts could be expensive.
-    if context {
-        let live_context_count = get_live_context_count();
-        let context_size = std::mem::size_of::<Context>();
-        hash_aset_usize!(hash, "live_context_count", live_context_count);
-        hash_aset_usize!(hash, "live_context_size", live_context_count * context_size);
-    }
+    // How many bytes we are using to store context data
+    let context_data = CodegenGlobals::get_context_data();
+    hash_aset_usize!(hash, "context_data_bytes", context_data.num_bytes());

     // VM instructions count
     hash_aset_usize!(hash, "vm_insns_count", rb_vm_insns_count as usize);
@@ -846,21 +840,6 @@ fn rb_yjit_gen_stats_dict(context: bool) -> VALUE {
     hash
 }

-fn get_live_context_count() -> usize {
-    let mut count = 0;
-    for_each_iseq_payload(|iseq_payload| {
-        for blocks in iseq_payload.version_map.iter() {
-            for block in blocks.iter() {
-                count += unsafe { block.as_ref() }.get_ctx_count();
-            }
-        }
-        for block in iseq_payload.dead_blocks.iter() {
-            count += unsafe { block.as_ref() }.get_ctx_count();
-        }
-    });
-    count
-}
-
 /// Record the backtrace when a YJIT exit occurs. This functionality requires
 /// that the stats feature is enabled as well as the --yjit-trace-exits option.
 ///