[ruby/yarp] Move efficient file reading using demand paging to librubyparser

* So it can be reused by the Fiddle backend, etc and not just the C extension.
* Add YP_STRING_MAPPED to use a consistent interface for yp_string_t.
  That way yp_string_free() can be used like for other string types.
* Fix handling of empty file for !HAVE_MMAP && !_WIN32

https://github.com/ruby/yarp/commit/e40bc35801
This commit is contained in:
Benoit Daloze 2023-07-29 16:49:54 +02:00 committed by Takashi Kokubun
parent 2ccaaaa101
commit e712bc9b93
Notes: git 2023-08-17 00:48:11 +00:00
4 changed files with 176 additions and 174 deletions

View File

@ -14,14 +14,6 @@ VALUE rb_cYARPParseResult;
/* IO of Ruby code */
/******************************************************************************/
// Represents an input of Ruby code. It can either be coming from a file or a
// string. If it's a file, we'll use demand paging to read the contents of the
// file into a string. If it's already a string, we'll reference it directly.
typedef struct {
// Pointer to the first byte of the Ruby source. Depending on how the input
// was loaded this points at a mapped view of a file, a heap buffer, a
// constant empty string, or the buffer of a Ruby String.
const char *source;
// Number of bytes readable through source.
size_t size;
} input_t;
// Check if the given filepath is a string. If it's nil, then return NULL. If
// it's not a string, then raise a type error. Otherwise return the filepath as
// a C string.
@ -41,142 +33,15 @@ check_filepath(VALUE filepath) {
return StringValueCStr(filepath);
}
// Read the file indicated by the filepath parameter into source and load its
// contents and size into the given input_t.
//
// We want to use demand paging as much as possible in order to avoid having to
// read the entire file into memory (which could be detrimental to performance
// for large files). This means that if we're on windows we'll use
// `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
// `mmap`, and on other POSIX systems we'll use `read`.
// Returns 0 on success and 1 on failure. On success input->source and
// input->size describe the file contents and must later be released with
// input_unload_filepath. On failure a diagnostic is printed with perror (except
// for allocation failure) and input is left unmodified.
//
// An empty file never maps or allocates anything: the source is set to a
// constant empty string with a size of 0.
static int
input_load_filepath(input_t *input, const char *filepath) {
#ifdef _WIN32
    // Open the file for reading.
    HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (file == INVALID_HANDLE_VALUE) {
        perror("CreateFile failed");
        return 1;
    }

    // Get the file size.
    DWORD file_size = GetFileSize(file, NULL);
    if (file_size == INVALID_FILE_SIZE) {
        CloseHandle(file);
        perror("GetFileSize failed");
        return 1;
    }

    // If the file is empty, then we don't need to do anything else, we'll set
    // the source to a constant empty string and return.
    if (!file_size) {
        CloseHandle(file);
        input->size = 0;
        input->source = "";
        return 0;
    }

    // Create a mapping of the file.
    HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
    if (mapping == NULL) {
        CloseHandle(file);
        perror("CreateFileMapping failed");
        return 1;
    }

    // Map the file into memory. The view stays valid after the handles are
    // closed, so they are released right away.
    const char *view = (const char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(mapping);
    CloseHandle(file);

    if (view == NULL) {
        perror("MapViewOfFile failed");
        return 1;
    }

    input->source = view;
    input->size = (size_t) file_size;
    return 0;
#else
    // Open the file for reading
    int fd = open(filepath, O_RDONLY);
    if (fd == -1) {
        perror("open");
        return 1;
    }

    // Stat the file to get the file size
    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        // Report before closing so that close() cannot clobber errno.
        perror("fstat");
        close(fd);
        return 1;
    }

    size_t size = (size_t) sb.st_size;

    // Handle the empty file up front for both the mmap and the read paths:
    // mmap rejects a zero length and malloc(0) is allowed to return NULL,
    // which would otherwise be mistaken for a failure.
    if (size == 0) {
        close(fd);
        input->size = 0;
        input->source = "";
        return 0;
    }

#ifdef HAVE_MMAP
    // mmap the file descriptor to virtually get the contents
    const char *result = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (result == MAP_FAILED) {
        perror("Map failed");
        close(fd);
        return 1;
    }
#else
    char *result = malloc(size);
    if (result == NULL) {
        close(fd);
        return 1;
    }

    // read() may legitimately return fewer bytes than requested, so keep
    // reading until the whole file is in memory, or until an error or a
    // premature end of file stops us.
    size_t total = 0;
    while (total < size) {
        ssize_t read_size = read(fd, result + total, size - total);
        if (read_size <= 0) break;
        total += (size_t) read_size;
    }

    if (total != size) {
        perror("Read size is incorrect");
        free(result);
        close(fd);
        return 1;
    }
#endif

    // The descriptor is no longer needed once the contents are mapped or read.
    close(fd);
    input->source = result;
    input->size = size;
    return 0;
#endif
}
// Load the contents and size of the given string into the given input_t.
// Load the contents and size of the given string into the given yp_string_t.
static void
input_load_string(input_t *input, VALUE string) {
input_load_string(yp_string_t *input, VALUE string) {
// Check if the string is a string. If it's not, then raise a type error.
if (!RB_TYPE_P(string, T_STRING)) {
rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(string));
}
input->source = RSTRING_PTR(string);
input->size = RSTRING_LEN(string);
}
// Free any resources associated with the given input_t. This is the counterpart
// to input_load_filepath. It will unmap the file if it was mapped, or free the
// memory if it was allocated.
static void
input_unload_filepath(input_t *input) {
// We don't need to free anything with 0 sized files because we handle that
// with a constant string instead.
if (!input->size) return;
void *memory = (void *) input->source;
#if defined(_WIN32)
UnmapViewOfFile(memory);
#elif defined(HAVE_MMAP)
munmap(memory, input->size);
#else
free(memory);
#endif
yp_string_constant_init(input, RSTRING_PTR(string), RSTRING_LEN(string));
}
/******************************************************************************/
@ -185,14 +50,14 @@ input_unload_filepath(input_t *input) {
// Dump the AST corresponding to the given input to a string.
static VALUE
dump_input(input_t *input, const char *filepath) {
dump_input(yp_string_t *input, const char *filepath) {
yp_buffer_t buffer;
if (!yp_buffer_init(&buffer)) {
rb_raise(rb_eNoMemError, "failed to allocate memory");
}
yp_parser_t parser;
yp_parser_init(&parser, input->source, input->size, filepath);
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
yp_node_t *node = yp_parse(&parser, false);
yp_serialize(&parser, node, &buffer);
@ -212,7 +77,7 @@ dump(int argc, VALUE *argv, VALUE self) {
VALUE filepath;
rb_scan_args(argc, argv, "11", &string, &filepath);
input_t input;
yp_string_t input;
input_load_string(&input, string);
return dump_input(&input, check_filepath(filepath));
}
@ -220,13 +85,13 @@ dump(int argc, VALUE *argv, VALUE self) {
// Dump the AST corresponding to the given file to a string.
static VALUE
dump_file(VALUE self, VALUE filepath) {
input_t input;
yp_string_t input;
const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil;
if (!yp_string_mapped_init(&input, checked)) return Qnil;
VALUE value = dump_input(&input, checked);
input_unload_filepath(&input);
yp_string_free(&input);
return value;
}
@ -356,13 +221,13 @@ lex_encoding_changed_callback(yp_parser_t *parser) {
// Return an array of tokens corresponding to the given source.
static VALUE
lex_input(input_t *input, const char *filepath) {
lex_input(yp_string_t *input, const char *filepath) {
yp_parser_t parser;
yp_parser_init(&parser, input->source, input->size, filepath);
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback);
VALUE offsets = rb_ary_new();
VALUE source_argv[] = { rb_str_new(input->source, input->size), offsets };
VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
lex_data_t lex_data = {
@ -410,7 +275,7 @@ lex(int argc, VALUE *argv, VALUE self) {
VALUE filepath;
rb_scan_args(argc, argv, "11", &string, &filepath);
input_t input;
yp_string_t input;
input_load_string(&input, string);
return lex_input(&input, check_filepath(filepath));
}
@ -418,13 +283,13 @@ lex(int argc, VALUE *argv, VALUE self) {
// Return an array of tokens corresponding to the given file.
static VALUE
lex_file(VALUE self, VALUE filepath) {
input_t input;
yp_string_t input;
const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil;
if (!yp_string_mapped_init(&input, checked)) return Qnil;
VALUE value = lex_input(&input, checked);
input_unload_filepath(&input);
yp_string_free(&input);
return value;
}
@ -435,9 +300,9 @@ lex_file(VALUE self, VALUE filepath) {
// Parse the given input and return a ParseResult instance.
static VALUE
parse_input(input_t *input, const char *filepath) {
parse_input(yp_string_t *input, const char *filepath) {
yp_parser_t parser;
yp_parser_init(&parser, input->source, input->size, filepath);
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
yp_node_t *node = yp_parse(&parser, false);
rb_encoding *encoding = rb_enc_find(parser.encoding.name);
@ -466,13 +331,14 @@ parse(int argc, VALUE *argv, VALUE self) {
VALUE filepath;
rb_scan_args(argc, argv, "11", &string, &filepath);
input_t input;
yp_string_t input;
input_load_string(&input, string);
#ifdef YARP_DEBUG_MODE_BUILD
char* dup = malloc(input.size);
memcpy(dup, input.source, input.size);
input.source = dup;
size_t length = yp_string_length(&input);
char* dup = malloc(length);
memcpy(dup, yp_string_source(&input), length);
yp_string_constant_init(&input, dup, length);
#endif
VALUE value = parse_input(&input, check_filepath(filepath));
@ -487,13 +353,13 @@ parse(int argc, VALUE *argv, VALUE self) {
// Parse the given file and return a ParseResult instance.
static VALUE
parse_file(VALUE self, VALUE filepath) {
input_t input;
yp_string_t input;
const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil;
if (!yp_string_mapped_init(&input, checked)) return Qnil;
VALUE value = parse_input(&input, checked);
input_unload_filepath(&input);
yp_string_free(&input);
return value;
}
@ -586,13 +452,13 @@ memsize(VALUE self, VALUE string) {
// parser for memory and speed.
static VALUE
profile_file(VALUE self, VALUE filepath) {
input_t input;
yp_string_t input;
const char *checked = check_filepath(filepath);
if (input_load_filepath(&input, checked) != 0) return Qnil;
if (!yp_string_mapped_init(&input, checked)) return Qnil;
yp_parser_t parser;
yp_parser_init(&parser, input.source, input.size, checked);
yp_parser_init(&parser, yp_string_source(&input), yp_string_length(&input), checked);
yp_node_t *node = yp_parse(&parser, false);
yp_node_destroy(&parser, node);

View File

@ -5,16 +5,6 @@
#include <ruby/encoding.h>
#include "yarp.h"
// The following headers are necessary to read files using demand paging.
#ifdef _WIN32
#include <windows.h>
#else
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#define EXPECTED_YARP_VERSION "0.4.0"
VALUE yp_source_new(yp_parser_t *parser);

View File

@ -1,5 +1,15 @@
#include "yarp/util/yp_string.h"
// The following headers are necessary to read files using demand paging.
#ifdef _WIN32
#include <windows.h>
#else
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
// Initialize a shared string that is based on initial input.
void
yp_string_shared_init(yp_string_t *string, const char *start, const char *end) {
@ -36,6 +46,17 @@ yp_string_constant_init(yp_string_t *string, const char *source, size_t length)
};
}
// Fill in the given string as a YP_STRING_MAPPED string that refers to `length`
// bytes starting at `source`.
static void
yp_string_mapped_init_internal(yp_string_t *string, char *source, size_t length) {
    // Start from a zeroed value tagged as mapped, then record the contents.
    yp_string_t mapped = { .type = YP_STRING_MAPPED };
    mapped.as.mapped.source = source;
    mapped.as.mapped.length = length;

    *string = mapped;
}
// Returns the memory size associated with the string.
size_t
yp_string_memsize(const yp_string_t *string) {
@ -84,5 +105,113 @@ YP_EXPORTED_FUNCTION void
yp_string_free(yp_string_t *string) {
    // Owned strings hold a heap allocation that has to be released.
    if (string->type == YP_STRING_OWNED) {
        free(string->as.owned.source);
        return;
    }

    // Apart from owned strings, only mapped strings with a non-zero length
    // hold a resource. Zero-length mapped strings point at a constant empty
    // string, so there is nothing to release for them.
    if (string->type != YP_STRING_MAPPED || !string->as.mapped.length) {
        return;
    }

    // Release the contents with whatever mechanism was compiled in to load
    // them: a mapped view on Windows, mmap where available, malloc otherwise.
    void *memory = (void *) string->as.mapped.source;
#if defined(_WIN32)
    UnmapViewOfFile(memory);
#elif defined(HAVE_MMAP)
    munmap(memory, string->as.mapped.length);
#else
    free(memory);
#endif
}
// Load the contents of the file at `filepath` into `string` as a
// YP_STRING_MAPPED string.
//
// Returns true on success, in which case the string should eventually be
// released with yp_string_free(). Returns false on failure, in which case a
// diagnostic is printed with perror() (except for allocation failure) and
// `string` is left unmodified.
//
// An empty file never maps or allocates anything: the string refers to a
// constant empty string with a length of 0.
bool
yp_string_mapped_init(yp_string_t *string, const char *filepath) {
#ifdef _WIN32
    // Open the file for reading.
    HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (file == INVALID_HANDLE_VALUE) {
        perror("CreateFile failed");
        return false;
    }

    // Get the file size.
    DWORD file_size = GetFileSize(file, NULL);
    if (file_size == INVALID_FILE_SIZE) {
        CloseHandle(file);
        perror("GetFileSize failed");
        return false;
    }

    // If the file is empty, then we don't need to do anything else, we'll set
    // the source to a constant empty string and return.
    if (file_size == 0) {
        CloseHandle(file);
        yp_string_mapped_init_internal(string, "", 0);
        return true;
    }

    // Create a mapping of the file.
    HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
    if (mapping == NULL) {
        CloseHandle(file);
        perror("CreateFileMapping failed");
        return false;
    }

    // Map the file into memory. The view stays valid after the handles are
    // closed, so they are released right away.
    char *source = (char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(mapping);
    CloseHandle(file);

    if (source == NULL) {
        perror("MapViewOfFile failed");
        return false;
    }

    yp_string_mapped_init_internal(string, source, (size_t) file_size);
    return true;
#else
    // Open the file for reading
    int fd = open(filepath, O_RDONLY);
    if (fd == -1) {
        perror("open");
        return false;
    }

    // Stat the file to get the file size
    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        // Report before closing so that close() cannot clobber errno.
        perror("fstat");
        close(fd);
        return false;
    }

    size_t size = (size_t) sb.st_size;

    // Empty files are handled up front for both the mmap and the read paths:
    // mmap rejects a zero length and malloc(0) is allowed to return NULL.
    if (size == 0) {
        close(fd);
        yp_string_mapped_init_internal(string, "", 0);
        return true;
    }

    char *source = NULL;

#ifdef HAVE_MMAP
    // mmap the file descriptor to virtually get the contents
    source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (source == MAP_FAILED) {
        perror("Map failed");
        // Do not leak the descriptor when the mapping fails (for example
        // when filepath names a directory).
        close(fd);
        return false;
    }
#else
    source = malloc(size);
    if (source == NULL) {
        close(fd);
        return false;
    }

    // read() may legitimately return fewer bytes than requested, so keep
    // reading until the whole file is in memory, or until an error or a
    // premature end of file stops us.
    size_t total = 0;
    while (total < size) {
        ssize_t read_size = read(fd, source + total, size - total);
        if (read_size <= 0) break;
        total += (size_t) read_size;
    }

    if (total != size) {
        perror("Read size is incorrect");
        free(source);
        close(fd);
        return false;
    }
#endif

    // The descriptor is no longer needed once the contents are mapped or read.
    close(fd);
    yp_string_mapped_init_internal(string, source, size);
    return true;
#endif
}

View File

@ -3,13 +3,14 @@
#include "yarp/defines.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
// This struct represents a string value.
typedef struct {
enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT } type;
enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT, YP_STRING_MAPPED } type;
union {
struct {
@ -26,6 +27,11 @@ typedef struct {
const char *source;
size_t length;
} constant;
struct {
char *source;
size_t length;
} mapped;
} as;
} yp_string_t;
@ -38,6 +44,17 @@ void yp_string_owned_init(yp_string_t *string, char *source, size_t length);
// Initialize a constant string that doesn't own its memory source.
void yp_string_constant_init(yp_string_t *string, const char *source, size_t length);
// Read the file indicated by the filepath parameter and load its contents and
// size into the given yp_string_t. Returns true on success and false if the
// file could not be read. The given yp_string_t should be freed using
// yp_string_free() when it is no longer used.
//
// We want to use demand paging as much as possible in order to avoid having to
// read the entire file into memory (which could be detrimental to performance
// for large files). This means that if we're on windows we'll use
// `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
// `mmap`, and on other POSIX systems we'll use `read`.
bool yp_string_mapped_init(yp_string_t *string, const char *filepath);
// Returns the memory size associated with the string.
size_t yp_string_memsize(const yp_string_t *string);