From bc40d0609177cd60ba1bc4fd915e03e917c0b79c Mon Sep 17 00:00:00 2001 From: Martin Emde Date: Thu, 30 May 2024 21:37:24 -0700 Subject: [PATCH] [rubygems/rubygems] String search based parsing of compact index versions This significantly reduces memory usage. https://github.com/rubygems/rubygems/commit/8a76506c90 --- lib/bundler/compact_index_client.rb | 23 ++++++ lib/bundler/compact_index_client/parser.rb | 77 +++++++++++++------ .../compact_index_client/parser_spec.rb | 44 ++++++++++- 3 files changed, 116 insertions(+), 28 deletions(-) diff --git a/lib/bundler/compact_index_client.rb b/lib/bundler/compact_index_client.rb index d53f3ca605..692d68e579 100644 --- a/lib/bundler/compact_index_client.rb +++ b/lib/bundler/compact_index_client.rb @@ -4,6 +4,29 @@ require "pathname" require "set" module Bundler + # The CompactIndexClient is responsible for fetching and parsing the compact index. + # + # The compact index is a set of caching optimized files that are used to fetch gem information. + # The files are: + # - names: a list of all gem names + # - versions: a list of all gem versions + # - info/[gem]: a list of all versions of a gem + # + # The client is instantiated with: + # - `directory`: the root directory where the cache files are stored. + # - `fetcher`: (optional) an object that responds to #call(uri_path, headers) and returns an http response. + # If the `fetcher` is not provided, the client will only read cached files from disk. + # + # The client is organized into: + # - `Updater`: updates the cached files on disk using the fetcher. + # - `Cache`: calls the updater, caches files, read and return them from disk + # - `Parser`: parses the compact index file data + # - `CacheFile`: a concurrency safe file reader/writer that verifies checksums + # + # The client is intended to optimize memory usage and performance. + # It is called 100s or 1000s of times, parsing files with hundreds of thousands of lines. + # It may be called concurrently without global interpreter lock in some Rubies. + # As a result, some methods may look more complex than necessary to save memory or time. class CompactIndexClient # NOTE: MD5 is here not because we expect a server to respond with it, but # because we use it to generate the etag on first request during the upgrade diff --git a/lib/bundler/compact_index_client/parser.rb b/lib/bundler/compact_index_client/parser.rb index a227bc2cfd..3a0dec4907 100644 --- a/lib/bundler/compact_index_client/parser.rb +++ b/lib/bundler/compact_index_client/parser.rb @@ -10,6 +10,8 @@ module Bundler @info_checksums = nil @versions_by_name = nil @available = nil + @gem_parser = nil + @versions_data = nil end def names @@ -38,46 +40,71 @@ module Bundler end def info(name) - data = @compact_index.info(name, info_checksums[name]) + data = @compact_index.info(name, info_checksum(name)) lines(data).map {|line| gem_parser.parse(line).unshift(name) } end + # parse the last, most recently updated line of the versions file to determine availability def available? return @available unless @available.nil? - @available = !info_checksums.empty? + return @available = false unless versions_data&.size&.nonzero? + + line_end = versions_data.size - 1 + return @available = false if versions_data[line_end] != "\n" + + line_start = versions_data.rindex("\n", line_end - 1) + line_start ||= -1 # allow a single line versions file + + @available = !split_last_word(versions_data, line_start + 1, line_end).nil? end private - def info_checksums - @info_checksums ||= lines(@compact_index.versions).each_with_object({}) do |line, checksums| - parse_version_checksum(line, checksums) - end - end - - def lines(data) - return [] if data.nil? || data.empty? - lines = data.split("\n") - header = lines.index("---") - header ? lines[header + 1..-1] : lines + # Search for a line starting with gem name, then return last space-separated word (the checksum) + def info_checksum(name) + return unless versions_data + return unless (line_start = rindex_of_gem(name)) + return unless (line_end = versions_data.index("\n", line_start)) + split_last_word(versions_data, line_start, line_end) end def gem_parser @gem_parser ||= GemParser.new end - # This is mostly the same as `split(" ", 3)` but it avoids allocating extra objects. - # This method gets called at least once for every gem when parsing versions. - def parse_version_checksum(line, checksums) - line.freeze # allows slicing into the string to not allocate a copy of the line - name_end = line.index(" ") - checksum_start = line.index(" ", name_end + 1) + 1 - checksum_end = line.size - checksum_start - # freeze name since it is used as a hash key - # pre-freezing means a frozen copy isn't created - name = line[0, name_end].freeze - checksum = line[checksum_start, checksum_end] - checksums[name] = checksum + def versions_data + @versions_data ||= begin + data = @compact_index.versions + strip_header!(data) if data + data.freeze + end + end + + def rindex_of_gem(name) + if (pos = versions_data.rindex("\n#{name} ")) + pos + 1 + elsif versions_data.start_with?("#{name} ") + 0 + end + end + + # This is similar to `string.split(" ").last` but it avoids allocating extra objects. + def split_last_word(string, line_start, line_end) + return unless line_start < line_end && line_start >= 0 + word_start = string.rindex(" ", line_end).to_i + 1 + return if word_start < line_start + string[word_start, line_end - word_start] + end + + def lines(string) + return [] if string.nil? || string.empty? + strip_header!(string) + string.split("\n") + end + + def strip_header!(string) + header_end = string.index("---\n") + string.slice!(0, header_end + 4) if header_end end end end diff --git a/spec/bundler/bundler/compact_index_client/parser_spec.rb b/spec/bundler/bundler/compact_index_client/parser_spec.rb index 1d1d41a5a8..45a08fd9ff 100644 --- a/spec/bundler/bundler/compact_index_client/parser_spec.rb +++ b/spec/bundler/bundler/compact_index_client/parser_spec.rb @@ -20,7 +20,9 @@ RSpec.describe Bundler::CompactIndexClient::Parser do let(:compact_index) { TestCompactIndexClient.new(names, versions, info_data) } let(:names) { "a\nb\nc\n" } - let(:versions) { <<~VERSIONS } + let(:versions) { <<~VERSIONS.dup } + created_at: 2024-05-01T00:00:04Z + --- a 1.0.0,1.0.1,1.1.0 aaa111 b 2.0.0,2.0.0-java bbb222 c 3.0.0,3.0.3,3.3.3 ccc333 @@ -33,7 +35,8 @@ RSpec.describe Bundler::CompactIndexClient::Parser do "c" => { "ccc333yanked" => c_info }, } end - let(:a_info) { <<~INFO } + let(:a_info) { <<~INFO.dup } + --- 1.0.0 |checksum:aaa1,ruby:>= 3.0.0,rubygems:>= 3.2.3 1.0.1 |checksum:aaa2,ruby:>= 3.0.0,rubygems:>= 3.2.3 1.1.0 |checksum:aaa3,ruby:>= 3.0.0,rubygems:>= 3.2.3 @@ -52,10 +55,45 @@ RSpec.describe Bundler::CompactIndexClient::Parser do expect(parser).to be_available end - it "returns false when versions are not available" do + it "returns true when versions has only one gem" do + compact_index.versions = +"a 1.0.0 aaa1\n" + expect(parser).to be_available + end + + it "returns true when versions has a gem and a header" do + compact_index.versions = +"---\na 1.0.0 aaa1\n" + expect(parser).to be_available + end + + it "returns true when versions has a gem and a header with header data" do + compact_index.versions = +"created_at: 2024-05-01T00:00:04Z\n---\na 1.0.0 aaa1\n" + expect(parser).to be_available + end + + it "returns false when versions has only the header" do + compact_index.versions = +"---\n" + expect(parser).not_to be_available + end + + it "returns false when versions has only the header with header data" do + compact_index.versions = +"created_at: 2024-05-01T00:00:04Z\n---\n" + expect(parser).not_to be_available + end + + it "returns false when versions index is not available" do compact_index.versions = nil expect(parser).not_to be_available end + + it "returns false when versions is empty" do + compact_index.versions = +"" + expect(parser).not_to be_available + end + + it "returns false when versions ends improperly without a newline" do + compact_index.versions = "a 1.0.0 aaa1" + expect(parser).not_to be_available + end end describe "#names" do