[ruby/yarp] Move efficient file reading using demand paging to librubyparser

* So it can be reused by the Fiddle backend, etc and not just the C extension.
* Add YP_STRING_MAPPED to use a consistent interface for yp_string_t.
  That way yp_string_free() can be used like for other string types.
* Fix handling of empty file for !HAVE_MMAP && !_WIN32

https://github.com/ruby/yarp/commit/e40bc35801
This commit is contained in:
Benoit Daloze 2023-07-29 16:49:54 +02:00 committed by Takashi Kokubun
parent 2ccaaaa101
commit e712bc9b93
Notes: git 2023-08-17 00:48:11 +00:00
4 changed files with 176 additions and 174 deletions

View File

@ -14,14 +14,6 @@ VALUE rb_cYARPParseResult;
/* IO of Ruby code */ /* IO of Ruby code */
/******************************************************************************/ /******************************************************************************/
// Represents an input of Ruby code. It can either be coming from a file or a
// string. If it's a file, we'll use demand paging to read the contents of the
// file into a string. If it's already a string, we'll reference it directly.
typedef struct {
const char *source;
size_t size;
} input_t;
// Check if the given filepath is a string. If it's nil, then return NULL. If // Check if the given filepath is a string. If it's nil, then return NULL. If
// it's not a string, then raise a type error. Otherwise return the filepath as // it's not a string, then raise a type error. Otherwise return the filepath as
// a C string. // a C string.
@ -41,142 +33,15 @@ check_filepath(VALUE filepath) {
return StringValueCStr(filepath); return StringValueCStr(filepath);
} }
// Read the file indicated by the filepath parameter into source and load its // Load the contents and size of the given string into the given yp_string_t.
// contents and size into the given input_t.
//
// We want to use demand paging as much as possible in order to avoid having to
// read the entire file into memory (which could be detrimental to performance
// for large files). This means that if we're on windows we'll use
// `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
// `mmap`, and on other POSIX systems we'll use `read`.
static int
input_load_filepath(input_t *input, const char *filepath) {
#ifdef _WIN32
// Open the file for reading.
HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (file == INVALID_HANDLE_VALUE) {
perror("CreateFile failed");
return 1;
}
// Get the file size.
DWORD file_size = GetFileSize(file, NULL);
if (file_size == INVALID_FILE_SIZE) {
CloseHandle(file);
perror("GetFileSize failed");
return 1;
}
// If the file is empty, then we don't need to do anything else, we'll set
// the source to a constant empty string and return.
if (!file_size) {
CloseHandle(file);
input->size = 0;
input->source = "";
return 0;
}
// Create a mapping of the file.
HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
if (mapping == NULL) {
CloseHandle(file);
perror("CreateFileMapping failed");
return 1;
}
// Map the file into memory.
input->source = (const char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
CloseHandle(mapping);
CloseHandle(file);
if (input->source == NULL) {
perror("MapViewOfFile failed");
return 1;
}
// Set the size of the source.
input->size = (size_t) file_size;
return 0;
#else
// Open the file for reading
int fd = open(filepath, O_RDONLY);
if (fd == -1) {
perror("open");
return 1;
}
// Stat the file to get the file size
struct stat sb;
if (fstat(fd, &sb) == -1) {
close(fd);
perror("fstat");
return 1;
}
// mmap the file descriptor to virtually get the contents
input->size = sb.st_size;
#ifdef HAVE_MMAP
if (!input->size) {
close(fd);
input->source = "";
return 0;
}
const char *result = mmap(NULL, input->size, PROT_READ, MAP_PRIVATE, fd, 0);
if (result == MAP_FAILED) {
perror("Map failed");
return 1;
} else {
input->source = result;
}
#else
input->source = malloc(input->size);
if (input->source == NULL) return 1;
ssize_t read_size = read(fd, (void *) input->source, input->size);
if (read_size < 0 || (size_t)read_size != input->size) {
perror("Read size is incorrect");
free((void *) input->source);
return 1;
}
#endif
close(fd);
return 0;
#endif
}
// Load the contents and size of the given string into the given input_t.
static void static void
input_load_string(input_t *input, VALUE string) { input_load_string(yp_string_t *input, VALUE string) {
// Check if the string is a string. If it's not, then raise a type error. // Check if the string is a string. If it's not, then raise a type error.
if (!RB_TYPE_P(string, T_STRING)) { if (!RB_TYPE_P(string, T_STRING)) {
rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(string)); rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(string));
} }
input->source = RSTRING_PTR(string); yp_string_constant_init(input, RSTRING_PTR(string), RSTRING_LEN(string));
input->size = RSTRING_LEN(string);
}
// Free any resources associated with the given input_t. This is the corollary
// function to source_file_load. It will unmap the file if it was mapped, or
// free the memory if it was allocated.
static void
input_unload_filepath(input_t *input) {
// We don't need to free anything with 0 sized files because we handle that
// with a constant string instead.
if (!input->size) return;
void *memory = (void *) input->source;
#if defined(_WIN32)
UnmapViewOfFile(memory);
#elif defined(HAVE_MMAP)
munmap(memory, input->size);
#else
free(memory);
#endif
} }
/******************************************************************************/ /******************************************************************************/
@ -185,14 +50,14 @@ input_unload_filepath(input_t *input) {
// Dump the AST corresponding to the given input to a string. // Dump the AST corresponding to the given input to a string.
static VALUE static VALUE
dump_input(input_t *input, const char *filepath) { dump_input(yp_string_t *input, const char *filepath) {
yp_buffer_t buffer; yp_buffer_t buffer;
if (!yp_buffer_init(&buffer)) { if (!yp_buffer_init(&buffer)) {
rb_raise(rb_eNoMemError, "failed to allocate memory"); rb_raise(rb_eNoMemError, "failed to allocate memory");
} }
yp_parser_t parser; yp_parser_t parser;
yp_parser_init(&parser, input->source, input->size, filepath); yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
yp_node_t *node = yp_parse(&parser, false); yp_node_t *node = yp_parse(&parser, false);
yp_serialize(&parser, node, &buffer); yp_serialize(&parser, node, &buffer);
@ -212,7 +77,7 @@ dump(int argc, VALUE *argv, VALUE self) {
VALUE filepath; VALUE filepath;
rb_scan_args(argc, argv, "11", &string, &filepath); rb_scan_args(argc, argv, "11", &string, &filepath);
input_t input; yp_string_t input;
input_load_string(&input, string); input_load_string(&input, string);
return dump_input(&input, check_filepath(filepath)); return dump_input(&input, check_filepath(filepath));
} }
@ -220,13 +85,13 @@ dump(int argc, VALUE *argv, VALUE self) {
// Dump the AST corresponding to the given file to a string. // Dump the AST corresponding to the given file to a string.
static VALUE static VALUE
dump_file(VALUE self, VALUE filepath) { dump_file(VALUE self, VALUE filepath) {
input_t input; yp_string_t input;
const char *checked = check_filepath(filepath); const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil; if (!yp_string_mapped_init(&input, checked)) return Qnil;
VALUE value = dump_input(&input, checked); VALUE value = dump_input(&input, checked);
input_unload_filepath(&input); yp_string_free(&input);
return value; return value;
} }
@ -356,13 +221,13 @@ lex_encoding_changed_callback(yp_parser_t *parser) {
// Return an array of tokens corresponding to the given source. // Return an array of tokens corresponding to the given source.
static VALUE static VALUE
lex_input(input_t *input, const char *filepath) { lex_input(yp_string_t *input, const char *filepath) {
yp_parser_t parser; yp_parser_t parser;
yp_parser_init(&parser, input->source, input->size, filepath); yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback); yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback);
VALUE offsets = rb_ary_new(); VALUE offsets = rb_ary_new();
VALUE source_argv[] = { rb_str_new(input->source, input->size), offsets }; VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource); VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
lex_data_t lex_data = { lex_data_t lex_data = {
@ -410,7 +275,7 @@ lex(int argc, VALUE *argv, VALUE self) {
VALUE filepath; VALUE filepath;
rb_scan_args(argc, argv, "11", &string, &filepath); rb_scan_args(argc, argv, "11", &string, &filepath);
input_t input; yp_string_t input;
input_load_string(&input, string); input_load_string(&input, string);
return lex_input(&input, check_filepath(filepath)); return lex_input(&input, check_filepath(filepath));
} }
@ -418,13 +283,13 @@ lex(int argc, VALUE *argv, VALUE self) {
// Return an array of tokens corresponding to the given file. // Return an array of tokens corresponding to the given file.
static VALUE static VALUE
lex_file(VALUE self, VALUE filepath) { lex_file(VALUE self, VALUE filepath) {
input_t input; yp_string_t input;
const char *checked = check_filepath(filepath); const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil; if (!yp_string_mapped_init(&input, checked)) return Qnil;
VALUE value = lex_input(&input, checked); VALUE value = lex_input(&input, checked);
input_unload_filepath(&input); yp_string_free(&input);
return value; return value;
} }
@ -435,9 +300,9 @@ lex_file(VALUE self, VALUE filepath) {
// Parse the given input and return a ParseResult instance. // Parse the given input and return a ParseResult instance.
static VALUE static VALUE
parse_input(input_t *input, const char *filepath) { parse_input(yp_string_t *input, const char *filepath) {
yp_parser_t parser; yp_parser_t parser;
yp_parser_init(&parser, input->source, input->size, filepath); yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
yp_node_t *node = yp_parse(&parser, false); yp_node_t *node = yp_parse(&parser, false);
rb_encoding *encoding = rb_enc_find(parser.encoding.name); rb_encoding *encoding = rb_enc_find(parser.encoding.name);
@ -466,13 +331,14 @@ parse(int argc, VALUE *argv, VALUE self) {
VALUE filepath; VALUE filepath;
rb_scan_args(argc, argv, "11", &string, &filepath); rb_scan_args(argc, argv, "11", &string, &filepath);
input_t input; yp_string_t input;
input_load_string(&input, string); input_load_string(&input, string);
#ifdef YARP_DEBUG_MODE_BUILD #ifdef YARP_DEBUG_MODE_BUILD
char* dup = malloc(input.size); size_t length = yp_string_length(&input);
memcpy(dup, input.source, input.size); char* dup = malloc(length);
input.source = dup; memcpy(dup, yp_string_source(&input), length);
yp_string_constant_init(&input, dup, length);
#endif #endif
VALUE value = parse_input(&input, check_filepath(filepath)); VALUE value = parse_input(&input, check_filepath(filepath));
@ -487,13 +353,13 @@ parse(int argc, VALUE *argv, VALUE self) {
// Parse the given file and return a ParseResult instance. // Parse the given file and return a ParseResult instance.
static VALUE static VALUE
parse_file(VALUE self, VALUE filepath) { parse_file(VALUE self, VALUE filepath) {
input_t input; yp_string_t input;
const char *checked = check_filepath(filepath); const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil; if (!yp_string_mapped_init(&input, checked)) return Qnil;
VALUE value = parse_input(&input, checked); VALUE value = parse_input(&input, checked);
input_unload_filepath(&input); yp_string_free(&input);
return value; return value;
} }
@ -586,13 +452,13 @@ memsize(VALUE self, VALUE string) {
// parser for memory and speed. // parser for memory and speed.
static VALUE static VALUE
profile_file(VALUE self, VALUE filepath) { profile_file(VALUE self, VALUE filepath) {
input_t input; yp_string_t input;
const char *checked = check_filepath(filepath); const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil; if (!yp_string_mapped_init(&input, checked)) return Qnil;
yp_parser_t parser; yp_parser_t parser;
yp_parser_init(&parser, input.source, input.size, checked); yp_parser_init(&parser, yp_string_source(&input), yp_string_length(&input), checked);
yp_node_t *node = yp_parse(&parser, false); yp_node_t *node = yp_parse(&parser, false);
yp_node_destroy(&parser, node); yp_node_destroy(&parser, node);

View File

@ -5,16 +5,6 @@
#include <ruby/encoding.h> #include <ruby/encoding.h>
#include "yarp.h" #include "yarp.h"
// The following headers are necessary to read files using demand paging.
#ifdef _WIN32
#include <windows.h>
#else
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#define EXPECTED_YARP_VERSION "0.4.0" #define EXPECTED_YARP_VERSION "0.4.0"
VALUE yp_source_new(yp_parser_t *parser); VALUE yp_source_new(yp_parser_t *parser);

View File

@ -1,5 +1,15 @@
#include "yarp/util/yp_string.h" #include "yarp/util/yp_string.h"
// The following headers are necessary to read files using demand paging.
#ifdef _WIN32
#include <windows.h>
#else
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
// Initialize a shared string that is based on initial input. // Initialize a shared string that is based on initial input.
void void
yp_string_shared_init(yp_string_t *string, const char *start, const char *end) { yp_string_shared_init(yp_string_t *string, const char *start, const char *end) {
@ -36,6 +46,17 @@ yp_string_constant_init(yp_string_t *string, const char *source, size_t length)
}; };
} }
// Fill in the given string as a YP_STRING_MAPPED string that refers to the
// `length` bytes starting at `source`. This helper only records the pointer and
// the length; it does not map, read, or copy anything itself.
static void
yp_string_mapped_init_internal(yp_string_t *string, char *source, size_t length) {
    yp_string_t mapped = { .type = YP_STRING_MAPPED };

    mapped.as.mapped.source = source;
    mapped.as.mapped.length = length;

    *string = mapped;
}
// Returns the memory size associated with the string. // Returns the memory size associated with the string.
size_t size_t
yp_string_memsize(const yp_string_t *string) { yp_string_memsize(const yp_string_t *string) {
@ -84,5 +105,113 @@ YP_EXPORTED_FUNCTION void
yp_string_free(yp_string_t *string) { yp_string_free(yp_string_t *string) {
if (string->type == YP_STRING_OWNED) { if (string->type == YP_STRING_OWNED) {
free(string->as.owned.source); free(string->as.owned.source);
} else if (string->type == YP_STRING_MAPPED && string->as.mapped.length) {
void *memory = (void *) string->as.mapped.source;
#if defined(_WIN32)
UnmapViewOfFile(memory);
#elif defined(HAVE_MMAP)
munmap(memory, string->as.mapped.length);
#else
free(memory);
#endif
} }
} }
// Load the contents of the file at `filepath` into `string` as a
// YP_STRING_MAPPED string.
//
// On Windows the file is mapped with MapViewOfFile; on other systems it is
// mapped with mmap when HAVE_MMAP is defined, and otherwise copied into a
// malloc'd buffer with read.
//
// Returns true on success. Returns false if the file could not be opened,
// sized, mapped, allocated, or fully read; in that case `string` is left
// untouched and every handle/descriptor/buffer acquired here has been released.
// Apart from allocation failure, a diagnostic is printed with perror.
//
// An empty file succeeds with a zero-length string backed by a constant "",
// which yp_string_free leaves alone because it only releases mapped strings
// with a non-zero length.
bool
yp_string_mapped_init(yp_string_t *string, const char *filepath) {
#ifdef _WIN32
    // Open the file for reading.
    HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);

    if (file == INVALID_HANDLE_VALUE) {
        perror("CreateFile failed");
        return false;
    }

    // Get the file size.
    // NOTE(review): only the low 32 bits of the size are requested here, so
    // files of 4GiB or more are presumably not handled correctly - confirm
    // whether GetFileSizeEx should be used instead.
    DWORD file_size = GetFileSize(file, NULL);
    if (file_size == INVALID_FILE_SIZE) {
        CloseHandle(file);
        perror("GetFileSize failed");
        return false;
    }

    // If the file is empty, then we don't need to do anything else, we'll set
    // the source to a constant empty string and return.
    if (file_size == 0) {
        CloseHandle(file);
        yp_string_mapped_init_internal(string, "", 0);
        return true;
    }

    // Create a mapping of the file.
    HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
    if (mapping == NULL) {
        CloseHandle(file);
        perror("CreateFileMapping failed");
        return false;
    }

    // Map the file into memory. Both handles are closed right after the view is
    // requested, whether or not it succeeded.
    char *source = (char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(mapping);
    CloseHandle(file);

    if (source == NULL) {
        perror("MapViewOfFile failed");
        return false;
    }

    yp_string_mapped_init_internal(string, source, (size_t) file_size);
    return true;
#else
    // Open the file for reading
    int fd = open(filepath, O_RDONLY);
    if (fd == -1) {
        perror("open");
        return false;
    }

    // Stat the file to get the file size
    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        // Report before closing so that close cannot overwrite errno.
        perror("fstat");
        close(fd);
        return false;
    }

    size_t size = (size_t) sb.st_size;

    // An empty file needs neither a mapping nor a buffer: use a constant empty
    // string instead.
    if (size == 0) {
        close(fd);
        yp_string_mapped_init_internal(string, "", 0);
        return true;
    }

#ifdef HAVE_MMAP
    // mmap the file descriptor to virtually get the contents
    char *source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (source == MAP_FAILED) {
        perror("Map failed");
        close(fd);
        return false;
    }
#else
    // Without mmap, copy the whole file into a heap buffer.
    char *source = malloc(size);
    if (source == NULL) {
        close(fd);
        return false;
    }

    // read may return fewer bytes than requested, so keep reading until the
    // whole file has been copied, an error occurs, or we hit end-of-file early.
    size_t total = 0;
    while (total < size) {
        ssize_t count = read(fd, source + total, size - total);
        if (count <= 0) break;
        total += (size_t) count;
    }

    if (total != size) {
        perror("Read size is incorrect");
        free(source);
        close(fd);
        return false;
    }
#endif

    // The descriptor is no longer needed once the contents are mapped or copied.
    close(fd);
    yp_string_mapped_init_internal(string, source, size);
    return true;
#endif
}

View File

@ -3,13 +3,14 @@
#include "yarp/defines.h" #include "yarp/defines.h"
#include <stdbool.h>
#include <stddef.h> #include <stddef.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
// This struct represents a string value. // This struct represents a string value.
typedef struct { typedef struct {
enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT } type; enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT, YP_STRING_MAPPED } type;
union { union {
struct { struct {
@ -26,6 +27,11 @@ typedef struct {
const char *source; const char *source;
size_t length; size_t length;
} constant; } constant;
struct {
char *source;
size_t length;
} mapped;
} as; } as;
} yp_string_t; } yp_string_t;
@ -38,6 +44,17 @@ void yp_string_owned_init(yp_string_t *string, char *source, size_t length);
// Initialize a constant string that doesn't own its memory source. // Initialize a constant string that doesn't own its memory source.
void yp_string_constant_init(yp_string_t *string, const char *source, size_t length); void yp_string_constant_init(yp_string_t *string, const char *source, size_t length);
// Read the file indicated by the filepath parameter and load its contents and
// size into the given yp_string_t. Returns true on success and false if the
// file could not be loaded.
// On success, the given yp_string_t should be freed using yp_string_free() when
// it is no longer used.
//
// We want to use demand paging as much as possible in order to avoid having to
// read the entire file into memory (which could be detrimental to performance
// for large files). This means that if we're on Windows we'll use
// `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
// `mmap`, and on other POSIX systems we'll use `read`.
bool yp_string_mapped_init(yp_string_t *string, const char *filepath);
// Returns the memory size associated with the string. // Returns the memory size associated with the string.
size_t yp_string_memsize(const yp_string_t *string); size_t yp_string_memsize(const yp_string_t *string);