[ruby/prism] Validate multibyte characters in strings
Check that multibyte characters are valid using pm_strpbrk. We need to add a couple of code paths to ensure all encodings are covered. Importantly, this doesn't check regular expressions, because invalid multibyte characters are apparently allowed inside regular expression comment groups/extended mode. Upstream commit: https://github.com/ruby/prism/commit/2857d3e1b5
This commit is contained in:
parent
dc5191d695
commit
2fa051f627
@ -2253,12 +2253,12 @@ static const uint8_t pm_utf_8_dfa[] = {
|
|||||||
static pm_unicode_codepoint_t
|
static pm_unicode_codepoint_t
|
||||||
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
||||||
assert(n >= 0);
|
assert(n >= 0);
|
||||||
size_t maximum = (size_t) n;
|
|
||||||
|
|
||||||
|
size_t maximum = (n > 4) ? 4 : ((size_t) n);
|
||||||
uint32_t codepoint;
|
uint32_t codepoint;
|
||||||
uint32_t state = 0;
|
uint32_t state = 0;
|
||||||
|
|
||||||
for (size_t index = 0; index < 4 && index < maximum; index++) {
|
for (size_t index = 0; index < maximum; index++) {
|
||||||
uint32_t byte = b[index];
|
uint32_t byte = b[index];
|
||||||
uint32_t type = pm_utf_8_dfa[byte];
|
uint32_t type = pm_utf_8_dfa[byte];
|
||||||
|
|
||||||
@ -2267,7 +2267,7 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
|||||||
(0xffu >> type) & (byte);
|
(0xffu >> type) & (byte);
|
||||||
|
|
||||||
state = pm_utf_8_dfa[256 + (state * 16) + type];
|
state = pm_utf_8_dfa[256 + (state * 16) + type];
|
||||||
if (!state) {
|
if (state == 0) {
|
||||||
*width = index + 1;
|
*width = index + 1;
|
||||||
return (pm_unicode_codepoint_t) codepoint;
|
return (pm_unicode_codepoint_t) codepoint;
|
||||||
}
|
}
|
||||||
@ -2282,9 +2282,17 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
|||||||
*/
|
*/
|
||||||
size_t
|
size_t
|
||||||
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||||
size_t width;
|
assert(n >= 0);
|
||||||
pm_utf_8_codepoint(b, n, &width);
|
|
||||||
return width;
|
size_t maximum = (n > 4) ? 4 : ((size_t) n);
|
||||||
|
uint32_t state = 0;
|
||||||
|
|
||||||
|
for (size_t index = 0; index < maximum; index++) {
|
||||||
|
state = pm_utf_8_dfa[256 + (state * 16) + pm_utf_8_dfa[b[index]]];
|
||||||
|
if (state == 0) return index + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -245,6 +245,13 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
|
|||||||
*/
|
*/
|
||||||
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
|
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
|
||||||
|
* can compare against it because invalid multibyte characters are not a thing
|
||||||
|
* in this encoding.
|
||||||
|
*/
|
||||||
|
#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse the given name of an encoding and return a pointer to the corresponding
|
* Parse the given name of an encoding and return a pointer to the corresponding
|
||||||
* encoding struct if one can be found, otherwise return NULL.
|
* encoding struct if one can be found, otherwise return NULL.
|
||||||
|
@ -9737,7 +9737,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// and then find the first one.
|
// and then find the first one.
|
||||||
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
|
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
|
||||||
const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
|
const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
|
||||||
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
|
|
||||||
// If we haven't found an escape yet, then this buffer will be
|
// If we haven't found an escape yet, then this buffer will be
|
||||||
// unallocated since we can refer directly to the source string.
|
// unallocated since we can refer directly to the source string.
|
||||||
@ -9746,7 +9746,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
while (breakpoint != NULL) {
|
while (breakpoint != NULL) {
|
||||||
// If we hit a null byte, skip directly past it.
|
// If we hit a null byte, skip directly past it.
|
||||||
if (*breakpoint == '\0') {
|
if (*breakpoint == '\0') {
|
||||||
breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
|
breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1), true);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -9765,7 +9765,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// we need to continue on past it.
|
// we need to continue on past it.
|
||||||
if (lex_mode->as.list.nesting > 0) {
|
if (lex_mode->as.list.nesting > 0) {
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
lex_mode->as.list.nesting--;
|
lex_mode->as.list.nesting--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -9850,7 +9850,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
token_buffer.cursor = parser->current.end;
|
token_buffer.cursor = parser->current.end;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -9863,7 +9863,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// that looked like an interpolated class or instance variable
|
// that looked like an interpolated class or instance variable
|
||||||
// like "#@" but wasn't actually. In this case we'll just skip
|
// like "#@" but wasn't actually. In this case we'll just skip
|
||||||
// to the next breakpoint.
|
// to the next breakpoint.
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -9878,7 +9878,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// and find the next breakpoint.
|
// and find the next breakpoint.
|
||||||
assert(*breakpoint == lex_mode->as.list.incrementor);
|
assert(*breakpoint == lex_mode->as.list.incrementor);
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
lex_mode->as.list.nesting++;
|
lex_mode->as.list.nesting++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -9917,14 +9917,14 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// regular expression. We'll use strpbrk to find the first of these
|
// regular expression. We'll use strpbrk to find the first of these
|
||||||
// characters.
|
// characters.
|
||||||
const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
|
const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
|
||||||
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
pm_token_buffer_t token_buffer = { { 0 }, 0 };
|
pm_token_buffer_t token_buffer = { { 0 }, 0 };
|
||||||
|
|
||||||
while (breakpoint != NULL) {
|
while (breakpoint != NULL) {
|
||||||
// If we hit a null byte, skip directly past it.
|
// If we hit a null byte, skip directly past it.
|
||||||
if (*breakpoint == '\0') {
|
if (*breakpoint == '\0') {
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -9946,7 +9946,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// If the terminator is not a newline, then we can set
|
// If the terminator is not a newline, then we can set
|
||||||
// the next breakpoint and continue.
|
// the next breakpoint and continue.
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -9956,7 +9956,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
if (*breakpoint == lex_mode->as.regexp.terminator) {
|
if (*breakpoint == lex_mode->as.regexp.terminator) {
|
||||||
if (lex_mode->as.regexp.nesting > 0) {
|
if (lex_mode->as.regexp.nesting > 0) {
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
lex_mode->as.regexp.nesting--;
|
lex_mode->as.regexp.nesting--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -10055,7 +10055,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
token_buffer.cursor = parser->current.end;
|
token_buffer.cursor = parser->current.end;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -10068,7 +10068,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// something that looked like an interpolated class or
|
// something that looked like an interpolated class or
|
||||||
// instance variable like "#@" but wasn't actually. In
|
// instance variable like "#@" but wasn't actually. In
|
||||||
// this case we'll just skip to the next breakpoint.
|
// this case we'll just skip to the next breakpoint.
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -10083,7 +10083,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// and find the next breakpoint.
|
// and find the next breakpoint.
|
||||||
assert(*breakpoint == lex_mode->as.regexp.incrementor);
|
assert(*breakpoint == lex_mode->as.regexp.incrementor);
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
|
||||||
lex_mode->as.regexp.nesting++;
|
lex_mode->as.regexp.nesting++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -10119,7 +10119,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// string. We'll use strpbrk to find the first of these characters.
|
// string. We'll use strpbrk to find the first of these characters.
|
||||||
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
|
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
|
||||||
const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
|
const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
|
||||||
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
|
|
||||||
// If we haven't found an escape yet, then this buffer will be
|
// If we haven't found an escape yet, then this buffer will be
|
||||||
// unallocated since we can refer directly to the source string.
|
// unallocated since we can refer directly to the source string.
|
||||||
@ -10131,7 +10131,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
|
if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
|
||||||
lex_mode->as.string.nesting++;
|
lex_mode->as.string.nesting++;
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -10143,7 +10143,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// to continue on past it.
|
// to continue on past it.
|
||||||
if (lex_mode->as.string.nesting > 0) {
|
if (lex_mode->as.string.nesting > 0) {
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
lex_mode->as.string.nesting--;
|
lex_mode->as.string.nesting--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -10185,7 +10185,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
if (parser->heredoc_end == NULL) {
|
if (parser->heredoc_end == NULL) {
|
||||||
pm_newline_list_append(&parser->newline_list, breakpoint);
|
pm_newline_list_append(&parser->newline_list, breakpoint);
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
@ -10199,7 +10199,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
case '\0':
|
case '\0':
|
||||||
// Skip directly past the null character.
|
// Skip directly past the null character.
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
case '\\': {
|
case '\\': {
|
||||||
// Here we hit escapes.
|
// Here we hit escapes.
|
||||||
@ -10268,7 +10268,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
token_buffer.cursor = parser->current.end;
|
token_buffer.cursor = parser->current.end;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case '#': {
|
case '#': {
|
||||||
@ -10279,7 +10279,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// looked like an interpolated class or instance variable like "#@"
|
// looked like an interpolated class or instance variable like "#@"
|
||||||
// but wasn't actually. In this case we'll just skip to the next
|
// but wasn't actually. In this case we'll just skip to the next
|
||||||
// breakpoint.
|
// breakpoint.
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -10407,7 +10407,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
breakpoints[2] = '\0';
|
breakpoints[2] = '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
pm_token_buffer_t token_buffer = { { 0 }, 0 };
|
pm_token_buffer_t token_buffer = { { 0 }, 0 };
|
||||||
bool was_escaped_newline = false;
|
bool was_escaped_newline = false;
|
||||||
|
|
||||||
@ -10416,7 +10416,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
case '\0':
|
case '\0':
|
||||||
// Skip directly past the null character.
|
// Skip directly past the null character.
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
case '\n': {
|
case '\n': {
|
||||||
if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
|
if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
|
||||||
@ -10491,7 +10491,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// Otherwise we hit a newline and it wasn't followed by
|
// Otherwise we hit a newline and it wasn't followed by
|
||||||
// a terminator, so we can continue parsing.
|
// a terminator, so we can continue parsing.
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case '\\': {
|
case '\\': {
|
||||||
@ -10555,7 +10555,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
token_buffer.cursor = parser->current.end;
|
token_buffer.cursor = parser->current.end;
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case '#': {
|
case '#': {
|
||||||
@ -10567,7 +10567,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// or instance variable like "#@" but wasn't
|
// or instance variable like "#@" but wasn't
|
||||||
// actually. In this case we'll just skip to the
|
// actually. In this case we'll just skip to the
|
||||||
// next breakpoint.
|
// next breakpoint.
|
||||||
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,10 +1,18 @@
|
|||||||
#include "prism/util/pm_strpbrk.h"
|
#include "prism/util/pm_strpbrk.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the slow path that does care about the encoding.
|
* Add an invalid multibyte character error to the parser.
|
||||||
|
*/
|
||||||
|
static inline void
|
||||||
|
pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
||||||
|
pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the default path.
|
||||||
*/
|
*/
|
||||||
static inline const uint8_t *
|
static inline const uint8_t *
|
||||||
pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
|
|
||||||
while (index < maximum) {
|
while (index < maximum) {
|
||||||
@ -12,22 +20,39 @@ pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const ui
|
|||||||
return source + index;
|
return source + index;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
|
if (source[index] < 0x80) {
|
||||||
if (width == 0) {
|
index++;
|
||||||
return NULL;
|
} else {
|
||||||
}
|
size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
|
||||||
|
|
||||||
|
if (width > 0) {
|
||||||
index += width;
|
index += width;
|
||||||
|
} else if (!validate) {
|
||||||
|
index++;
|
||||||
|
} else {
|
||||||
|
// At this point we know we have an invalid multibyte character.
|
||||||
|
// We'll walk forward as far as we can until we find the next
|
||||||
|
// valid character so that we don't spam the user with a ton of
|
||||||
|
// the same kind of error.
|
||||||
|
const size_t start = index;
|
||||||
|
|
||||||
|
do {
|
||||||
|
index++;
|
||||||
|
} while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
|
||||||
|
|
||||||
|
pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the fast path that does not care about the encoding.
|
* This is the path when the encoding is ASCII-8BIT.
|
||||||
*/
|
*/
|
||||||
static inline const uint8_t *
|
static inline const uint8_t *
|
||||||
pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
|
|
||||||
while (index < maximum) {
|
while (index < maximum) {
|
||||||
@ -41,6 +66,85 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the slow path that does care about the encoding.
|
||||||
|
*/
|
||||||
|
static inline const uint8_t *
|
||||||
|
pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
||||||
|
size_t index = 0;
|
||||||
|
|
||||||
|
while (index < maximum) {
|
||||||
|
if (strchr((const char *) charset, source[index]) != NULL) {
|
||||||
|
return source + index;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (source[index] < 0x80) {
|
||||||
|
index++;
|
||||||
|
} else {
|
||||||
|
size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
|
||||||
|
|
||||||
|
if (width > 0) {
|
||||||
|
index += width;
|
||||||
|
} else if (!validate) {
|
||||||
|
index++;
|
||||||
|
} else {
|
||||||
|
// At this point we know we have an invalid multibyte character.
|
||||||
|
// We'll walk forward as far as we can until we find the next
|
||||||
|
// valid character so that we don't spam the user with a ton of
|
||||||
|
// the same kind of error.
|
||||||
|
const size_t start = index;
|
||||||
|
|
||||||
|
do {
|
||||||
|
index++;
|
||||||
|
} while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
|
||||||
|
|
||||||
|
pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the fast path that does not care about the encoding because we know
|
||||||
|
* the encoding only supports single-byte characters.
|
||||||
|
*/
|
||||||
|
static inline const uint8_t *
|
||||||
|
pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
||||||
|
size_t index = 0;
|
||||||
|
|
||||||
|
while (index < maximum) {
|
||||||
|
if (strchr((const char *) charset, source[index]) != NULL) {
|
||||||
|
return source + index;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (source[index] < 0x80 || !validate) {
|
||||||
|
index++;
|
||||||
|
} else {
|
||||||
|
size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
|
||||||
|
|
||||||
|
if (width > 0) {
|
||||||
|
index += width;
|
||||||
|
} else {
|
||||||
|
// At this point we know we have an invalid multibyte character.
|
||||||
|
// We'll walk forward as far as we can until we find the next
|
||||||
|
// valid character so that we don't spam the user with a ton of
|
||||||
|
// the same kind of error.
|
||||||
|
const size_t start = index;
|
||||||
|
|
||||||
|
do {
|
||||||
|
index++;
|
||||||
|
} while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
|
||||||
|
|
||||||
|
pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Here we have rolled our own version of strpbrk. The standard library strpbrk
|
* Here we have rolled our own version of strpbrk. The standard library strpbrk
|
||||||
* has undefined behavior when the source string is not null-terminated. We want
|
* has undefined behavior when the source string is not null-terminated. We want
|
||||||
@ -57,16 +161,20 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
|
|||||||
*
|
*
|
||||||
* Finally, we want to support encodings wherein the charset could contain
|
* Finally, we want to support encodings wherein the charset could contain
|
||||||
* characters that are trailing bytes of multi-byte characters. For example, in
|
* characters that are trailing bytes of multi-byte characters. For example, in
|
||||||
* Shift-JIS, the backslash character can be a trailing byte. In that case we
|
* Shift_JIS, the backslash character can be a trailing byte. In that case we
|
||||||
* need to take a slower path and iterate one multi-byte character at a time.
|
* need to take a slower path and iterate one multi-byte character at a time.
|
||||||
*/
|
*/
|
||||||
const uint8_t *
|
const uint8_t *
|
||||||
pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
|
pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
|
||||||
if (length <= 0) {
|
if (length <= 0) {
|
||||||
return NULL;
|
return NULL;
|
||||||
} else if (parser->encoding_changed && parser->encoding->multibyte) {
|
} else if (!parser->encoding_changed) {
|
||||||
return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
|
return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
|
||||||
|
} else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
|
||||||
|
return pm_strpbrk_ascii_8bit(source, charset, (size_t) length);
|
||||||
|
} else if (parser->encoding->multibyte) {
|
||||||
|
return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
|
||||||
} else {
|
} else {
|
||||||
return pm_strpbrk_single_byte(source, charset, (size_t) length);
|
return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#define PRISM_STRPBRK_H
|
#define PRISM_STRPBRK_H
|
||||||
|
|
||||||
#include "prism/defines.h"
|
#include "prism/defines.h"
|
||||||
|
#include "prism/diagnostic.h"
|
||||||
#include "prism/parser.h"
|
#include "prism/parser.h"
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
@ -35,9 +36,11 @@
|
|||||||
* @param source The source to search.
|
* @param source The source to search.
|
||||||
* @param charset The charset to search for.
|
* @param charset The charset to search for.
|
||||||
* @param length The maximum number of bytes to search.
|
* @param length The maximum number of bytes to search.
|
||||||
|
* @param validate Whether to validate that the source string is valid in the
|
||||||
|
* current encoding of the parser.
|
||||||
* @return A pointer to the first character in the source string that is in the
|
* @return A pointer to the first character in the source string that is in the
|
||||||
* charset, or NULL if no such character exists.
|
* charset, or NULL if no such character exists.
|
||||||
*/
|
*/
|
||||||
const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
|
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user