[ruby/yarp] Move efficient file reading using demand paging to librubyparser
* So it can be reused by the Fiddle backend, etc., and not just the C extension.
* Add YP_STRING_MAPPED to use a consistent interface for yp_string_t. That way yp_string_free() can be used just as it is for the other string types.
* Fix handling of an empty file for !HAVE_MMAP && !_WIN32.

https://github.com/ruby/yarp/commit/e40bc35801
This commit is contained in:
parent
2ccaaaa101
commit
e712bc9b93
Notes:
git
2023-08-17 00:48:11 +00:00
192
yarp/extension.c
192
yarp/extension.c
@ -14,14 +14,6 @@ VALUE rb_cYARPParseResult;
|
||||
/* IO of Ruby code */
|
||||
/******************************************************************************/
|
||||
|
||||
// Represents an input of Ruby code. It can either be coming from a file or a
|
||||
// string. If it's a file, we'll use demand paging to read the contents of the
|
||||
// file into a string. If it's already a string, we'll reference it directly.
|
||||
typedef struct {
|
||||
// Pointer to the first byte of the Ruby source. Depending on how the input was
// loaded this is a file mapping, a malloc'd buffer, a constant empty string, or
// the buffer of a Ruby String.
const char *source;
|
||||
// Number of bytes of source; 0 for an empty file.
size_t size;
|
||||
} input_t;
|
||||
|
||||
// Check if the given filepath is a string. If it's nil, then return NULL. If
|
||||
// it's not a string, then raise a type error. Otherwise return the filepath as
|
||||
// a C string.
|
||||
@ -41,142 +33,15 @@ check_filepath(VALUE filepath) {
|
||||
return StringValueCStr(filepath);
|
||||
}
|
||||
|
||||
// Read the file indicated by the filepath parameter into source and load its
|
||||
// contents and size into the given input_t.
|
||||
//
|
||||
// We want to use demand paging as much as possible in order to avoid having to
|
||||
// read the entire file into memory (which could be detrimental to performance
|
||||
// for large files). This means that if we're on windows we'll use
|
||||
// `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
|
||||
// `mmap`, and on other POSIX systems we'll use `read`.
|
||||
static int
|
||||
input_load_filepath(input_t *input, const char *filepath) {
|
||||
#ifdef _WIN32
|
||||
// Open the file for reading.
|
||||
HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
||||
|
||||
if (file == INVALID_HANDLE_VALUE) {
|
||||
perror("CreateFile failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Get the file size.
|
||||
DWORD file_size = GetFileSize(file, NULL);
|
||||
if (file_size == INVALID_FILE_SIZE) {
|
||||
CloseHandle(file);
|
||||
perror("GetFileSize failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// If the file is empty, then we don't need to do anything else, we'll set
|
||||
// the source to a constant empty string and return.
|
||||
if (!file_size) {
|
||||
CloseHandle(file);
|
||||
input->size = 0;
|
||||
input->source = "";
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Create a mapping of the file.
|
||||
HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
if (mapping == NULL) {
|
||||
CloseHandle(file);
|
||||
perror("CreateFileMapping failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Map the file into memory.
|
||||
input->source = (const char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
|
||||
CloseHandle(mapping);
|
||||
CloseHandle(file);
|
||||
|
||||
if (input->source == NULL) {
|
||||
perror("MapViewOfFile failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Set the size of the source.
|
||||
input->size = (size_t) file_size;
|
||||
return 0;
|
||||
#else
|
||||
// Open the file for reading
|
||||
int fd = open(filepath, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
perror("open");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Stat the file to get the file size
|
||||
struct stat sb;
|
||||
if (fstat(fd, &sb) == -1) {
|
||||
close(fd);
|
||||
perror("fstat");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// mmap the file descriptor to virtually get the contents
|
||||
input->size = sb.st_size;
|
||||
|
||||
#ifdef HAVE_MMAP
|
||||
if (!input->size) {
|
||||
close(fd);
|
||||
input->source = "";
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char *result = mmap(NULL, input->size, PROT_READ, MAP_PRIVATE, fd, 0);
|
||||
if (result == MAP_FAILED) {
|
||||
perror("Map failed");
|
||||
return 1;
|
||||
} else {
|
||||
input->source = result;
|
||||
}
|
||||
#else
|
||||
input->source = malloc(input->size);
|
||||
if (input->source == NULL) return 1;
|
||||
|
||||
ssize_t read_size = read(fd, (void *) input->source, input->size);
|
||||
if (read_size < 0 || (size_t)read_size != input->size) {
|
||||
perror("Read size is incorrect");
|
||||
free((void *) input->source);
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
close(fd);
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Load the contents and size of the given string into the given input_t.
|
||||
// Load the contents and size of the given string into the given yp_string_t.
|
||||
static void
|
||||
input_load_string(input_t *input, VALUE string) {
|
||||
input_load_string(yp_string_t *input, VALUE string) {
|
||||
// Check if the string is a string. If it's not, then raise a type error.
|
||||
if (!RB_TYPE_P(string, T_STRING)) {
|
||||
rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(string));
|
||||
}
|
||||
|
||||
input->source = RSTRING_PTR(string);
|
||||
input->size = RSTRING_LEN(string);
|
||||
}
|
||||
|
||||
// Free any resources associated with the given input_t. This is the corollary
|
||||
// function to input_load_filepath. It will unmap the file if it was mapped, or
|
||||
// free the memory if it was allocated.
|
||||
static void
|
||||
input_unload_filepath(input_t *input) {
|
||||
// We don't need to free anything with 0 sized files because we handle that
|
||||
// with a constant string instead.
|
||||
if (!input->size) return;
|
||||
void *memory = (void *) input->source;
|
||||
|
||||
#if defined(_WIN32)
|
||||
UnmapViewOfFile(memory);
|
||||
#elif defined(HAVE_MMAP)
|
||||
munmap(memory, input->size);
|
||||
#else
|
||||
free(memory);
|
||||
#endif
|
||||
yp_string_constant_init(input, RSTRING_PTR(string), RSTRING_LEN(string));
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
@ -185,14 +50,14 @@ input_unload_filepath(input_t *input) {
|
||||
|
||||
// Dump the AST corresponding to the given input to a string.
|
||||
static VALUE
|
||||
dump_input(input_t *input, const char *filepath) {
|
||||
dump_input(yp_string_t *input, const char *filepath) {
|
||||
yp_buffer_t buffer;
|
||||
if (!yp_buffer_init(&buffer)) {
|
||||
rb_raise(rb_eNoMemError, "failed to allocate memory");
|
||||
}
|
||||
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, input->source, input->size, filepath);
|
||||
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
|
||||
|
||||
yp_node_t *node = yp_parse(&parser, false);
|
||||
yp_serialize(&parser, node, &buffer);
|
||||
@ -212,7 +77,7 @@ dump(int argc, VALUE *argv, VALUE self) {
|
||||
VALUE filepath;
|
||||
rb_scan_args(argc, argv, "11", &string, &filepath);
|
||||
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
input_load_string(&input, string);
|
||||
return dump_input(&input, check_filepath(filepath));
|
||||
}
|
||||
@ -220,13 +85,13 @@ dump(int argc, VALUE *argv, VALUE self) {
|
||||
// Dump the AST corresponding to the given file to a string.
|
||||
static VALUE
|
||||
dump_file(VALUE self, VALUE filepath) {
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
|
||||
const char *checked = check_filepath(filepath);
|
||||
if (input_load_filepath(&input, checked) != 0) return Qnil;
|
||||
if (!yp_string_mapped_init(&input, checked)) return Qnil;
|
||||
|
||||
VALUE value = dump_input(&input, checked);
|
||||
input_unload_filepath(&input);
|
||||
yp_string_free(&input);
|
||||
|
||||
return value;
|
||||
}
|
||||
@ -356,13 +221,13 @@ lex_encoding_changed_callback(yp_parser_t *parser) {
|
||||
|
||||
// Return an array of tokens corresponding to the given source.
|
||||
static VALUE
|
||||
lex_input(input_t *input, const char *filepath) {
|
||||
lex_input(yp_string_t *input, const char *filepath) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, input->source, input->size, filepath);
|
||||
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
|
||||
yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback);
|
||||
|
||||
VALUE offsets = rb_ary_new();
|
||||
VALUE source_argv[] = { rb_str_new(input->source, input->size), offsets };
|
||||
VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
|
||||
VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
|
||||
|
||||
lex_data_t lex_data = {
|
||||
@ -410,7 +275,7 @@ lex(int argc, VALUE *argv, VALUE self) {
|
||||
VALUE filepath;
|
||||
rb_scan_args(argc, argv, "11", &string, &filepath);
|
||||
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
input_load_string(&input, string);
|
||||
return lex_input(&input, check_filepath(filepath));
|
||||
}
|
||||
@ -418,13 +283,13 @@ lex(int argc, VALUE *argv, VALUE self) {
|
||||
// Return an array of tokens corresponding to the given file.
|
||||
static VALUE
|
||||
lex_file(VALUE self, VALUE filepath) {
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
|
||||
const char *checked = check_filepath(filepath);
|
||||
if (input_load_filepath(&input, checked) != 0) return Qnil;
|
||||
if (!yp_string_mapped_init(&input, checked)) return Qnil;
|
||||
|
||||
VALUE value = lex_input(&input, checked);
|
||||
input_unload_filepath(&input);
|
||||
yp_string_free(&input);
|
||||
|
||||
return value;
|
||||
}
|
||||
@ -435,9 +300,9 @@ lex_file(VALUE self, VALUE filepath) {
|
||||
|
||||
// Parse the given input and return a ParseResult instance.
|
||||
static VALUE
|
||||
parse_input(input_t *input, const char *filepath) {
|
||||
parse_input(yp_string_t *input, const char *filepath) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, input->source, input->size, filepath);
|
||||
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
|
||||
|
||||
yp_node_t *node = yp_parse(&parser, false);
|
||||
rb_encoding *encoding = rb_enc_find(parser.encoding.name);
|
||||
@ -466,13 +331,14 @@ parse(int argc, VALUE *argv, VALUE self) {
|
||||
VALUE filepath;
|
||||
rb_scan_args(argc, argv, "11", &string, &filepath);
|
||||
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
input_load_string(&input, string);
|
||||
|
||||
#ifdef YARP_DEBUG_MODE_BUILD
|
||||
char* dup = malloc(input.size);
|
||||
memcpy(dup, input.source, input.size);
|
||||
input.source = dup;
|
||||
size_t length = yp_string_length(&input);
|
||||
char* dup = malloc(length);
|
||||
memcpy(dup, yp_string_source(&input), length);
|
||||
yp_string_constant_init(&input, dup, length);
|
||||
#endif
|
||||
|
||||
VALUE value = parse_input(&input, check_filepath(filepath));
|
||||
@ -487,13 +353,13 @@ parse(int argc, VALUE *argv, VALUE self) {
|
||||
// Parse the given file and return a ParseResult instance.
|
||||
static VALUE
|
||||
parse_file(VALUE self, VALUE filepath) {
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
|
||||
const char *checked = check_filepath(filepath);
|
||||
if (input_load_filepath(&input, checked) != 0) return Qnil;
|
||||
if (!yp_string_mapped_init(&input, checked)) return Qnil;
|
||||
|
||||
VALUE value = parse_input(&input, checked);
|
||||
input_unload_filepath(&input);
|
||||
yp_string_free(&input);
|
||||
|
||||
return value;
|
||||
}
|
||||
@ -586,13 +452,13 @@ memsize(VALUE self, VALUE string) {
|
||||
// parser for memory and speed.
|
||||
static VALUE
|
||||
profile_file(VALUE self, VALUE filepath) {
|
||||
input_t input;
|
||||
yp_string_t input;
|
||||
|
||||
const char *checked = check_filepath(filepath);
|
||||
if (input_load_filepath(&input, checked) != 0) return Qnil;
|
||||
if (!yp_string_mapped_init(&input, checked)) return Qnil;
|
||||
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, input.source, input.size, checked);
|
||||
yp_parser_init(&parser, yp_string_source(&input), yp_string_length(&input), checked);
|
||||
|
||||
yp_node_t *node = yp_parse(&parser, false);
|
||||
yp_node_destroy(&parser, node);
|
||||
|
@ -5,16 +5,6 @@
|
||||
#include <ruby/encoding.h>
|
||||
#include "yarp.h"
|
||||
|
||||
// The following headers are necessary to read files using demand paging.
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#define EXPECTED_YARP_VERSION "0.4.0"
|
||||
|
||||
VALUE yp_source_new(yp_parser_t *parser);
|
||||
|
@ -1,5 +1,15 @@
|
||||
#include "yarp/util/yp_string.h"
|
||||
|
||||
// The following headers are necessary to read files using demand paging.
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// Initialize a shared string that is based on initial input.
|
||||
void
|
||||
yp_string_shared_init(yp_string_t *string, const char *start, const char *end) {
|
||||
@ -36,6 +46,17 @@ yp_string_constant_init(yp_string_t *string, const char *source, size_t length)
|
||||
};
|
||||
}
|
||||
|
||||
static void
|
||||
yp_string_mapped_init_internal(yp_string_t *string, char *source, size_t length) {
|
||||
*string = (yp_string_t) {
|
||||
.type = YP_STRING_MAPPED,
|
||||
.as.mapped = {
|
||||
.source = source,
|
||||
.length = length
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Returns the memory size associated with the string.
|
||||
size_t
|
||||
yp_string_memsize(const yp_string_t *string) {
|
||||
@ -84,5 +105,113 @@ YP_EXPORTED_FUNCTION void
|
||||
yp_string_free(yp_string_t *string) {
    // Owned strings hold a heap buffer that is handed straight back to free().
    if (string->type == YP_STRING_OWNED) {
        free(string->as.owned.source);
        return;
    }

    // Of the remaining kinds, only a mapped string with a non-zero length has
    // anything to release.
    if (string->type != YP_STRING_MAPPED || string->as.mapped.length == 0) {
        return;
    }

    // Release the region with the mechanism that matches the platform.
    void *region = (void *) string->as.mapped.source;
#if defined(_WIN32)
    UnmapViewOfFile(region);
#elif defined(HAVE_MMAP)
    munmap(region, string->as.mapped.length);
#else
    free(region);
#endif
}
|
||||
|
||||
bool
|
||||
yp_string_mapped_init(yp_string_t *string, const char *filepath) {
|
||||
#ifdef _WIN32
|
||||
// Open the file for reading.
|
||||
HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
||||
|
||||
if (file == INVALID_HANDLE_VALUE) {
|
||||
perror("CreateFile failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the file size.
|
||||
DWORD file_size = GetFileSize(file, NULL);
|
||||
if (file_size == INVALID_FILE_SIZE) {
|
||||
CloseHandle(file);
|
||||
perror("GetFileSize failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
// If the file is empty, then we don't need to do anything else, we'll set
|
||||
// the source to a constant empty string and return.
|
||||
if (file_size == 0) {
|
||||
CloseHandle(file);
|
||||
yp_string_mapped_init_internal(string, "", 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Create a mapping of the file.
|
||||
HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
if (mapping == NULL) {
|
||||
CloseHandle(file);
|
||||
perror("CreateFileMapping failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Map the file into memory.
|
||||
char *source = (char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
|
||||
CloseHandle(mapping);
|
||||
CloseHandle(file);
|
||||
|
||||
if (source == NULL) {
|
||||
perror("MapViewOfFile failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
yp_string_mapped_init_internal(string, source, (size_t) file_size);
|
||||
return true;
|
||||
#else
|
||||
// Open the file for reading
|
||||
int fd = open(filepath, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
perror("open");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Stat the file to get the file size
|
||||
struct stat sb;
|
||||
if (fstat(fd, &sb) == -1) {
|
||||
close(fd);
|
||||
perror("fstat");
|
||||
return false;
|
||||
}
|
||||
|
||||
// mmap the file descriptor to virtually get the contents
|
||||
size_t size = (size_t) sb.st_size;
|
||||
char *source = NULL;
|
||||
|
||||
if (size == 0) {
|
||||
close(fd);
|
||||
yp_string_mapped_init_internal(string, "", 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef HAVE_MMAP
|
||||
source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
|
||||
if (source == MAP_FAILED) {
|
||||
perror("Map failed");
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
source = malloc(size);
|
||||
if (source == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ssize_t read_size = read(fd, (void *) source, size);
|
||||
if (read_size < 0 || (size_t)read_size != size) {
|
||||
perror("Read size is incorrect");
|
||||
free((void *) source);
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
close(fd);
|
||||
yp_string_mapped_init_internal(string, source, size);
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
@ -3,13 +3,14 @@
|
||||
|
||||
#include "yarp/defines.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// This struct represents a string value.
|
||||
typedef struct {
|
||||
enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT } type;
|
||||
enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT, YP_STRING_MAPPED } type;
|
||||
|
||||
union {
|
||||
struct {
|
||||
@ -26,6 +27,11 @@ typedef struct {
|
||||
const char *source;
|
||||
size_t length;
|
||||
} constant;
|
||||
|
||||
struct {
|
||||
char *source;
|
||||
size_t length;
|
||||
} mapped;
|
||||
} as;
|
||||
} yp_string_t;
|
||||
|
||||
@ -38,6 +44,17 @@ void yp_string_owned_init(yp_string_t *string, char *source, size_t length);
|
||||
// Initialize a constant string that doesn't own its memory source.
|
||||
void yp_string_constant_init(yp_string_t *string, const char *source, size_t length);
|
||||
|
||||
// Read the file indicated by the filepath parameter into source and load its
|
||||
// contents and size into the given yp_string_t.
|
||||
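// Returns true on success and false if the file could not be opened, measured, or loaded. On success, the given yp_string_t should be freed using yp_string_free() when it is no longer used.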
|
||||
//
|
||||
// We want to use demand paging as much as possible in order to avoid having to
|
||||
// read the entire file into memory (which could be detrimental to performance
|
||||
// for large files). This means that if we're on windows we'll use
|
||||
// `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
|
||||
// `mmap`, and on other POSIX systems we'll use `read`.
|
||||
bool yp_string_mapped_init(yp_string_t *string, const char *filepath);
|
||||
|
||||
// Returns the memory size associated with the string.
|
||||
size_t yp_string_memsize(const yp_string_t *string);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user