[ruby/yarp] Switch from handling const char * to const uint8_t *
https://github.com/ruby/yarp/commit/465e7bb0a9
This commit is contained in:
parent
eac3da173a
commit
7be08f3f58
@ -6,6 +6,7 @@
|
||||
#include <ctype.h>
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -39,6 +40,6 @@
|
||||
# define snprintf _snprintf
|
||||
#endif
|
||||
|
||||
int yp_strncasecmp(const char *string1, const char *string2, size_t length);
|
||||
int yp_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length);
|
||||
|
||||
#endif
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
// Append an error to the given list of diagnostics.
|
||||
bool
|
||||
yp_diagnostic_list_append(yp_list_t *list, const char *start, const char *end, const char *message) {
|
||||
yp_diagnostic_list_append(yp_list_t *list, const uint8_t *start, const uint8_t *end, const char *message) {
|
||||
yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) malloc(sizeof(yp_diagnostic_t));
|
||||
if (diagnostic == NULL) return false;
|
||||
|
||||
|
@ -10,13 +10,13 @@
|
||||
// This struct represents a diagnostic found during parsing.
|
||||
typedef struct {
|
||||
yp_list_node_t node;
|
||||
const char *start;
|
||||
const char *end;
|
||||
const uint8_t *start;
|
||||
const uint8_t *end;
|
||||
const char *message;
|
||||
} yp_diagnostic_t;
|
||||
|
||||
// Append a diagnostic to the given list of diagnostics.
|
||||
bool yp_diagnostic_list_append(yp_list_t *list, const char *start, const char *end, const char *message);
|
||||
bool yp_diagnostic_list_append(yp_list_t *list, const uint8_t *start, const uint8_t *end, const char *message);
|
||||
|
||||
// Deallocate the internal state of the given diagnostic list.
|
||||
void yp_diagnostic_list_free(yp_list_t *list);
|
||||
|
@ -1,69 +1,42 @@
|
||||
#include "yarp/enc/yp_encoding.h"
|
||||
|
||||
typedef uint16_t yp_big5_codepoint_t;
|
||||
|
||||
static yp_big5_codepoint_t
|
||||
yp_big5_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
||||
const unsigned char *uc = (const unsigned char *) c;
|
||||
|
||||
static size_t
|
||||
yp_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
// These are the single byte characters.
|
||||
if (*uc < 0x80) {
|
||||
*width = 1;
|
||||
return *uc;
|
||||
if (*b < 0x80) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// These are the double byte characters.
|
||||
if ((n > 1) && (uc[0] >= 0xA1 && uc[0] <= 0xFE) && (uc[1] >= 0x40 && uc[1] <= 0xFE)) {
|
||||
*width = 2;
|
||||
return (yp_big5_codepoint_t) (uc[0] << 8 | uc[1]);
|
||||
if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE)) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
*width = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_big5_char_width(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_big5_codepoint(c, n, &width);
|
||||
|
||||
return width;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_big5_alpha_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alpha_char(&value, n);
|
||||
yp_encoding_big5_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_big5_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alpha_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_big5_alnum_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alnum_char(&value, n);
|
||||
yp_encoding_big5_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_big5_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alnum_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
yp_encoding_big5_isupper_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_isupper_char(&value, n);
|
||||
yp_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_big5_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_isupper_char(b, n);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
@ -16,22 +16,22 @@ typedef struct {
|
||||
// Return the number of bytes that the next character takes if it is valid
|
||||
// in the encoding. Does not read more than n bytes. It is assumed that n is
|
||||
// at least 1.
|
||||
size_t (*char_width)(const char *c, ptrdiff_t n);
|
||||
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// Return the number of bytes that the next character takes if it is valid
|
||||
// in the encoding and is alphabetical. Does not read more than n bytes. It
|
||||
// is assumed that n is at least 1.
|
||||
size_t (*alpha_char)(const char *c, ptrdiff_t n);
|
||||
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// Return the number of bytes that the next character takes if it is valid
|
||||
// in the encoding and is alphanumeric. Does not read more than n bytes. It
|
||||
// is assumed that n is at least 1.
|
||||
size_t (*alnum_char)(const char *c, ptrdiff_t n);
|
||||
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// Return true if the next character is valid in the encoding and is an
|
||||
// uppercase character. Does not read more than n bytes. It is assumed that
|
||||
// n is at least 1.
|
||||
bool (*isupper_char)(const char *c, ptrdiff_t n);
|
||||
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// The name of the encoding. This should correspond to a value that can be
|
||||
// passed to Encoding.find in Ruby.
|
||||
@ -49,18 +49,18 @@ typedef struct {
|
||||
|
||||
// These functions are reused by some other encodings, so they are defined here
|
||||
// so they can be shared.
|
||||
size_t yp_encoding_ascii_alpha_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
size_t yp_encoding_ascii_alnum_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
bool yp_encoding_ascii_isupper_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
size_t yp_encoding_ascii_alpha_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
size_t yp_encoding_ascii_alnum_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
bool yp_encoding_ascii_isupper_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
|
||||
// These functions are shared between the actual encoding and the fast path in
|
||||
// the parser so they need to be internally visible.
|
||||
size_t yp_encoding_utf_8_alpha_char(const char *c, ptrdiff_t n);
|
||||
size_t yp_encoding_utf_8_alnum_char(const char *c, ptrdiff_t n);
|
||||
size_t yp_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
|
||||
size_t yp_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// This lookup table is referenced in both the UTF-8 encoding file and the
|
||||
// parser directly in order to speed up the default encoding processing.
|
||||
extern unsigned char yp_encoding_unicode_table[256];
|
||||
extern uint8_t yp_encoding_unicode_table[256];
|
||||
|
||||
// These are the encodings that are supported by the parser. They are defined in
|
||||
// their own files in the src/enc directory.
|
||||
|
@ -1,75 +1,48 @@
|
||||
#include "yarp/enc/yp_encoding.h"
|
||||
|
||||
typedef uint16_t yp_euc_jp_codepoint_t;
|
||||
|
||||
static yp_euc_jp_codepoint_t
|
||||
yp_euc_jp_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
||||
const unsigned char *uc = (const unsigned char *) c;
|
||||
|
||||
static size_t
|
||||
yp_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
// These are the single byte characters.
|
||||
if (*uc < 0x80) {
|
||||
*width = 1;
|
||||
return *uc;
|
||||
if (*b < 0x80) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// These are the double byte characters.
|
||||
if (
|
||||
(n > 1) &&
|
||||
(
|
||||
((uc[0] == 0x8E) && (uc[1] >= 0xA1 && uc[1] <= 0xFE)) ||
|
||||
((uc[0] >= 0xA1 && uc[0] <= 0xFE) && (uc[1] >= 0xA1 && uc[1] <= 0xFE))
|
||||
((b[0] == 0x8E) && (b[1] >= 0xA1 && b[1] <= 0xFE)) ||
|
||||
((b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE))
|
||||
)
|
||||
) {
|
||||
*width = 2;
|
||||
return (yp_euc_jp_codepoint_t) (uc[0] << 8 | uc[1]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
*width = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_euc_jp_char_width(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_euc_jp_codepoint(c, n, &width);
|
||||
|
||||
return width;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_euc_jp_alpha_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alpha_char(&value, n);
|
||||
yp_encoding_euc_jp_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_euc_jp_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alpha_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_euc_jp_alnum_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alnum_char(&value, n);
|
||||
yp_encoding_euc_jp_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_euc_jp_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alnum_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
yp_encoding_euc_jp_isupper_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_isupper_char(&value, n);
|
||||
yp_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_euc_jp_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_isupper_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,78 +1,51 @@
|
||||
#include "yarp/enc/yp_encoding.h"
|
||||
|
||||
typedef uint16_t yp_gbk_codepoint_t;
|
||||
|
||||
static yp_gbk_codepoint_t
|
||||
yp_gbk_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
||||
const unsigned char *uc = (const unsigned char *) c;
|
||||
|
||||
static size_t
|
||||
yp_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
// These are the single byte characters.
|
||||
if (*uc < 0x80) {
|
||||
*width = 1;
|
||||
return *uc;
|
||||
if (*b < 0x80) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// These are the double byte characters.
|
||||
if (
|
||||
(n > 1) &&
|
||||
(
|
||||
((uc[0] >= 0xA1 && uc[0] <= 0xA9) && (uc[1] >= 0xA1 && uc[1] <= 0xFE)) || // GBK/1
|
||||
((uc[0] >= 0xB0 && uc[0] <= 0xF7) && (uc[1] >= 0xA1 && uc[1] <= 0xFE)) || // GBK/2
|
||||
((uc[0] >= 0x81 && uc[0] <= 0xA0) && (uc[1] >= 0x40 && uc[1] <= 0xFE) && (uc[1] != 0x7F)) || // GBK/3
|
||||
((uc[0] >= 0xAA && uc[0] <= 0xFE) && (uc[1] >= 0x40 && uc[1] <= 0xA0) && (uc[1] != 0x7F)) || // GBK/4
|
||||
((uc[0] >= 0xA8 && uc[0] <= 0xA9) && (uc[1] >= 0x40 && uc[1] <= 0xA0) && (uc[1] != 0x7F)) // GBK/5
|
||||
((b[0] >= 0xA1 && b[0] <= 0xA9) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/1
|
||||
((b[0] >= 0xB0 && b[0] <= 0xF7) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/2
|
||||
((b[0] >= 0x81 && b[0] <= 0xA0) && (b[1] >= 0x40 && b[1] <= 0xFE) && (b[1] != 0x7F)) || // GBK/3
|
||||
((b[0] >= 0xAA && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/4
|
||||
((b[0] >= 0xA8 && b[0] <= 0xA9) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) // GBK/5
|
||||
)
|
||||
) {
|
||||
*width = 2;
|
||||
return (yp_gbk_codepoint_t) (uc[0] << 8 | uc[1]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
*width = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_gbk_char_width(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_gbk_codepoint(c, n, &width);
|
||||
|
||||
return width;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_gbk_alpha_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alpha_char(&value, n);
|
||||
yp_encoding_gbk_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_gbk_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alpha_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_gbk_alnum_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alnum_char(&value, n);
|
||||
yp_encoding_gbk_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_gbk_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alnum_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
yp_encoding_gbk_isupper_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_isupper_char(&value, n);
|
||||
yp_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_gbk_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_isupper_char(b, n);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
@ -1,73 +1,46 @@
|
||||
#include "yarp/enc/yp_encoding.h"
|
||||
|
||||
typedef uint16_t yp_shift_jis_codepoint_t;
|
||||
|
||||
static yp_shift_jis_codepoint_t
|
||||
yp_shift_jis_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
||||
const unsigned char *uc = (const unsigned char *) c;
|
||||
|
||||
static size_t
|
||||
yp_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
// These are the single byte characters.
|
||||
if (*uc < 0x80 || (*uc >= 0xA1 && *uc <= 0xDF)) {
|
||||
*width = 1;
|
||||
return *uc;
|
||||
if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// These are the double byte characters.
|
||||
if (
|
||||
(n > 1) &&
|
||||
((uc[0] >= 0x81 && uc[0] <= 0x9F) || (uc[0] >= 0xE0 && uc[0] <= 0xFC)) &&
|
||||
(uc[1] >= 0x40 && uc[1] <= 0xFC)
|
||||
((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
|
||||
(b[1] >= 0x40 && b[1] <= 0xFC)
|
||||
) {
|
||||
*width = 2;
|
||||
return (yp_shift_jis_codepoint_t) (uc[0] << 8 | uc[1]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
*width = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_shift_jis_char_width(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_shift_jis_codepoint(c, n, &width);
|
||||
|
||||
return width;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_shift_jis_alpha_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alpha_char(&value, n);
|
||||
yp_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_shift_jis_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alpha_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_shift_jis_alnum_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alnum_char(&value, n);
|
||||
yp_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_shift_jis_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alnum_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
yp_encoding_shift_jis_isupper_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_isupper_char(&value, n);
|
||||
yp_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_shift_jis_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_isupper_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ASCII character.
|
||||
static unsigned char yp_encoding_ascii_table[256] = {
|
||||
static uint8_t yp_encoding_ascii_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -24,7 +24,7 @@ static unsigned char yp_encoding_ascii_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-1 character.
|
||||
static unsigned char yp_encoding_iso_8859_1_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_1_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -46,7 +46,7 @@ static unsigned char yp_encoding_iso_8859_1_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-2 character.
|
||||
static unsigned char yp_encoding_iso_8859_2_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_2_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -68,7 +68,7 @@ static unsigned char yp_encoding_iso_8859_2_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-3 character.
|
||||
static unsigned char yp_encoding_iso_8859_3_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_3_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -90,7 +90,7 @@ static unsigned char yp_encoding_iso_8859_3_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-4 character.
|
||||
static unsigned char yp_encoding_iso_8859_4_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_4_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -112,7 +112,7 @@ static unsigned char yp_encoding_iso_8859_4_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-5 character.
|
||||
static unsigned char yp_encoding_iso_8859_5_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_5_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -134,7 +134,7 @@ static unsigned char yp_encoding_iso_8859_5_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-6 character.
|
||||
static unsigned char yp_encoding_iso_8859_6_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_6_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -156,7 +156,7 @@ static unsigned char yp_encoding_iso_8859_6_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-7 character.
|
||||
static unsigned char yp_encoding_iso_8859_7_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_7_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -178,7 +178,7 @@ static unsigned char yp_encoding_iso_8859_7_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-8 character.
|
||||
static unsigned char yp_encoding_iso_8859_8_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_8_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -200,7 +200,7 @@ static unsigned char yp_encoding_iso_8859_8_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-9 character.
|
||||
static unsigned char yp_encoding_iso_8859_9_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_9_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -222,7 +222,7 @@ static unsigned char yp_encoding_iso_8859_9_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-10 character.
|
||||
static unsigned char yp_encoding_iso_8859_10_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_10_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -244,7 +244,7 @@ static unsigned char yp_encoding_iso_8859_10_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-11 character.
|
||||
static unsigned char yp_encoding_iso_8859_11_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_11_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -266,7 +266,7 @@ static unsigned char yp_encoding_iso_8859_11_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-13 character.
|
||||
static unsigned char yp_encoding_iso_8859_13_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_13_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -288,7 +288,7 @@ static unsigned char yp_encoding_iso_8859_13_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-14 character.
|
||||
static unsigned char yp_encoding_iso_8859_14_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_14_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -310,7 +310,7 @@ static unsigned char yp_encoding_iso_8859_14_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-15 character.
|
||||
static unsigned char yp_encoding_iso_8859_15_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_15_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -332,7 +332,7 @@ static unsigned char yp_encoding_iso_8859_15_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-16 character.
|
||||
static unsigned char yp_encoding_iso_8859_16_table[256] = {
|
||||
static uint8_t yp_encoding_iso_8859_16_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -354,7 +354,7 @@ static unsigned char yp_encoding_iso_8859_16_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding KOI8-R character.
|
||||
static unsigned char yp_encoding_koi8_r_table[256] = {
|
||||
static uint8_t yp_encoding_koi8_r_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -376,7 +376,7 @@ static unsigned char yp_encoding_koi8_r_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding windows-1251 character.
|
||||
static unsigned char yp_encoding_windows_1251_table[256] = {
|
||||
static uint8_t yp_encoding_windows_1251_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -398,7 +398,7 @@ static unsigned char yp_encoding_windows_1251_table[256] = {
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding windows-1252 character.
|
||||
static unsigned char yp_encoding_windows_1252_table[256] = {
|
||||
static uint8_t yp_encoding_windows_1252_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -419,34 +419,32 @@ static unsigned char yp_encoding_windows_1252_table[256] = {
|
||||
};
|
||||
|
||||
static size_t
|
||||
yp_encoding_ascii_char_width(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
const unsigned char v = (const unsigned char) *c;
|
||||
return v < 0x80 ? 1 : 0;
|
||||
yp_encoding_ascii_char_width(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return *b < 0x80 ? 1 : 0;
|
||||
}
|
||||
|
||||
size_t
|
||||
yp_encoding_ascii_alpha_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (yp_encoding_ascii_table[(const unsigned char) *c] & YP_ENCODING_ALPHABETIC_BIT);
|
||||
yp_encoding_ascii_alpha_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (yp_encoding_ascii_table[*b] & YP_ENCODING_ALPHABETIC_BIT);
|
||||
}
|
||||
|
||||
size_t
|
||||
yp_encoding_ascii_alnum_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (yp_encoding_ascii_table[(const unsigned char) *c] & YP_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
|
||||
yp_encoding_ascii_alnum_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (yp_encoding_ascii_table[*b] & YP_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
|
||||
}
|
||||
|
||||
bool
|
||||
yp_encoding_ascii_isupper_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (yp_encoding_ascii_table[(const unsigned char) *c] & YP_ENCODING_UPPERCASE_BIT);
|
||||
yp_encoding_ascii_isupper_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (yp_encoding_ascii_table[*b] & YP_ENCODING_UPPERCASE_BIT);
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_koi8_r_char_width(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
const unsigned char v = (const unsigned char) *c;
|
||||
return ((v >= 0x20 && v <= 0x7E) || (v >= 0x80)) ? 1 : 0;
|
||||
yp_encoding_koi8_r_char_width(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_single_char_width(YP_ATTRIBUTE_UNUSED const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
yp_encoding_single_char_width(YP_ATTRIBUTE_UNUSED const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -469,14 +467,14 @@ yp_encoding_t yp_encoding_ascii_8bit = {
|
||||
};
|
||||
|
||||
#define YP_ENCODING_TABLE(s, i, w) \
|
||||
static size_t yp_encoding_ ##i ## _alpha_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (yp_encoding_ ##i ## _table[(const unsigned char) *c] & YP_ENCODING_ALPHABETIC_BIT); \
|
||||
static size_t yp_encoding_ ##i ## _alpha_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (yp_encoding_ ##i ## _table[*b] & YP_ENCODING_ALPHABETIC_BIT); \
|
||||
} \
|
||||
static size_t yp_encoding_ ##i ## _alnum_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (yp_encoding_ ##i ## _table[(const unsigned char) *c] & YP_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
|
||||
static size_t yp_encoding_ ##i ## _alnum_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (yp_encoding_ ##i ## _table[*b] & YP_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
|
||||
} \
|
||||
static bool yp_encoding_ ##i ## _isupper_char(const char *c, YP_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (yp_encoding_ ##i ## _table[(const unsigned char) *c] & YP_ENCODING_UPPERCASE_BIT); \
|
||||
static bool yp_encoding_ ##i ## _isupper_char(const uint8_t *b, YP_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (yp_encoding_ ##i ## _table[*b] & YP_ENCODING_UPPERCASE_BIT); \
|
||||
} \
|
||||
yp_encoding_t yp_encoding_ ##i = { \
|
||||
.name = s, \
|
||||
|
@ -10,7 +10,7 @@ typedef uint32_t yp_unicode_codepoint_t;
|
||||
// this table is different from other encodings where we used a lookup table
|
||||
// because the indices of those tables are the byte representations, not the
|
||||
// codepoints themselves.
|
||||
unsigned char yp_encoding_unicode_table[256] = {
|
||||
uint8_t yp_encoding_unicode_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
@ -2220,7 +2220,7 @@ static const uint8_t yp_utf_8_dfa[] = {
|
||||
};
|
||||
|
||||
static yp_unicode_codepoint_t
|
||||
yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
|
||||
yp_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
||||
assert(n >= 1);
|
||||
size_t maximum = (size_t) n;
|
||||
|
||||
@ -2228,7 +2228,7 @@ yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
|
||||
uint32_t state = 0;
|
||||
|
||||
for (size_t index = 0; index < 4 && index < maximum; index++) {
|
||||
uint32_t byte = c[index];
|
||||
uint32_t byte = b[index];
|
||||
uint32_t type = yp_utf_8_dfa[byte];
|
||||
|
||||
codepoint = (state != 0) ?
|
||||
@ -2247,60 +2247,55 @@ yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_utf_8_char_width(const char *c, ptrdiff_t n) {
|
||||
yp_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
size_t width;
|
||||
const unsigned char *v = (const unsigned char *) c;
|
||||
|
||||
yp_utf_8_codepoint(v, n, &width);
|
||||
yp_utf_8_codepoint(b, n, &width);
|
||||
return width;
|
||||
}
|
||||
|
||||
size_t
|
||||
yp_encoding_utf_8_alpha_char(const char *c, ptrdiff_t n) {
|
||||
const unsigned char *v = (const unsigned char *) c;
|
||||
if (*v < 0x80) {
|
||||
return (yp_encoding_unicode_table[*v] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
|
||||
yp_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (*b < 0x80) {
|
||||
return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
|
||||
}
|
||||
|
||||
size_t width;
|
||||
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(v, n, &width);
|
||||
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
||||
|
||||
if (codepoint <= 0xFF) {
|
||||
return (yp_encoding_unicode_table[(unsigned char) codepoint] & YP_ENCODING_ALPHABETIC_BIT) ? width : 0;
|
||||
return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_ALPHABETIC_BIT) ? width : 0;
|
||||
} else {
|
||||
return yp_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
yp_encoding_utf_8_alnum_char(const char *c, ptrdiff_t n) {
|
||||
const unsigned char *v = (const unsigned char *) c;
|
||||
if (*v < 0x80) {
|
||||
return (yp_encoding_unicode_table[*v] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
|
||||
yp_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (*b < 0x80) {
|
||||
return (yp_encoding_unicode_table[*b] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
|
||||
}
|
||||
|
||||
size_t width;
|
||||
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(v, n, &width);
|
||||
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
||||
|
||||
if (codepoint <= 0xFF) {
|
||||
return (yp_encoding_unicode_table[(unsigned char) codepoint] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
|
||||
return (yp_encoding_unicode_table[(uint8_t) codepoint] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
|
||||
} else {
|
||||
return yp_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
yp_encoding_utf_8_isupper_char(const char *c, ptrdiff_t n) {
|
||||
const unsigned char *v = (const unsigned char *) c;
|
||||
if (*v < 0x80) {
|
||||
return (yp_encoding_unicode_table[*v] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
||||
yp_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (*b < 0x80) {
|
||||
return (yp_encoding_unicode_table[*b] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
||||
}
|
||||
|
||||
size_t width;
|
||||
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(v, n, &width);
|
||||
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
||||
|
||||
if (codepoint <= 0xFF) {
|
||||
return (yp_encoding_unicode_table[(unsigned char) codepoint] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
||||
return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
||||
} else {
|
||||
return yp_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
|
||||
}
|
||||
|
@ -1,73 +1,46 @@
|
||||
#include "yarp/enc/yp_encoding.h"
|
||||
|
||||
typedef uint16_t yp_windows_31j_codepoint_t;
|
||||
|
||||
static yp_windows_31j_codepoint_t
|
||||
yp_windows_31j_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
||||
const unsigned char *uc = (const unsigned char *) c;
|
||||
|
||||
static size_t
|
||||
yp_encoding_windows_31j_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
// These are the single byte characters.
|
||||
if (*uc < 0x80 || (*uc >= 0xA1 && *uc <= 0xDF)) {
|
||||
*width = 1;
|
||||
return *uc;
|
||||
if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// These are the double byte characters.
|
||||
if (
|
||||
(n > 1) &&
|
||||
((uc[0] >= 0x81 && uc[0] <= 0x9F) || (uc[0] >= 0xE0 && uc[0] <= 0xFC)) &&
|
||||
(uc[1] >= 0x40 && uc[1] <= 0xFC)
|
||||
((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
|
||||
(b[1] >= 0x40 && b[1] <= 0xFC)
|
||||
) {
|
||||
*width = 2;
|
||||
return (yp_windows_31j_codepoint_t) (uc[0] << 8 | uc[1]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
*width = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_windows_31j_char_width(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_windows_31j_codepoint(c, n, &width);
|
||||
|
||||
return width;
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_windows_31j_alpha_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alpha_char(&value, n);
|
||||
yp_encoding_windows_31j_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_windows_31j_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alpha_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
yp_encoding_windows_31j_alnum_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_alnum_char(&value, n);
|
||||
yp_encoding_windows_31j_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_windows_31j_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_alnum_char(b, n);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
yp_encoding_windows_31j_isupper_char(const char *c, ptrdiff_t n) {
|
||||
size_t width;
|
||||
yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
|
||||
|
||||
if (width == 1) {
|
||||
const char value = (const char) codepoint;
|
||||
return yp_encoding_ascii_isupper_char(&value, n);
|
||||
yp_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (yp_encoding_windows_31j_char_width(b, n) == 1) {
|
||||
return yp_encoding_ascii_isupper_char(b, n);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
@ -260,7 +260,7 @@ parse_lex_input(yp_string_t *input, const char *filepath, bool return_nodes) {
|
||||
yp_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
|
||||
|
||||
VALUE offsets = rb_ary_new();
|
||||
VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
|
||||
VALUE source_argv[] = { rb_str_new((const char *) yp_string_source(input), yp_string_length(input)), offsets };
|
||||
VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
|
||||
|
||||
parse_lex_data_t parse_lex_data = {
|
||||
@ -442,7 +442,7 @@ named_captures(VALUE self, VALUE source) {
|
||||
yp_string_list_t string_list;
|
||||
yp_string_list_init(&string_list);
|
||||
|
||||
if (!yp_regexp_named_capture_group_names(RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, &yp_encoding_utf_8)) {
|
||||
if (!yp_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, &yp_encoding_utf_8)) {
|
||||
yp_string_list_free(&string_list);
|
||||
return Qnil;
|
||||
}
|
||||
@ -450,7 +450,7 @@ named_captures(VALUE self, VALUE source) {
|
||||
VALUE names = rb_ary_new();
|
||||
for (size_t index = 0; index < string_list.length; index++) {
|
||||
const yp_string_t *string = &string_list.strings[index];
|
||||
rb_ary_push(names, rb_str_new(yp_string_source(string), yp_string_length(string)));
|
||||
rb_ary_push(names, rb_str_new((const char *) yp_string_source(string), yp_string_length(string)));
|
||||
}
|
||||
|
||||
yp_string_list_free(&string_list);
|
||||
@ -463,8 +463,8 @@ static VALUE
|
||||
unescape(VALUE source, yp_unescape_type_t unescape_type) {
|
||||
yp_string_t result;
|
||||
|
||||
if (yp_unescape_string(RSTRING_PTR(source), RSTRING_LEN(source), unescape_type, &result)) {
|
||||
VALUE str = rb_str_new(yp_string_source(&result), yp_string_length(&result));
|
||||
if (yp_unescape_string((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), unescape_type, &result)) {
|
||||
VALUE str = rb_str_new((const char *) yp_string_source(&result), yp_string_length(&result));
|
||||
yp_string_free(&result);
|
||||
return str;
|
||||
} else {
|
||||
@ -498,7 +498,7 @@ static VALUE
|
||||
memsize(VALUE self, VALUE string) {
|
||||
yp_parser_t parser;
|
||||
size_t length = RSTRING_LEN(string);
|
||||
yp_parser_init(&parser, RSTRING_PTR(string), length, NULL);
|
||||
yp_parser_init(&parser, (const uint8_t *) RSTRING_PTR(string), length, NULL);
|
||||
|
||||
yp_node_t *node = yp_parse(&parser);
|
||||
yp_memsize_t memsize;
|
||||
|
@ -109,14 +109,14 @@ typedef struct yp_lex_mode {
|
||||
|
||||
// When lexing a list, it takes into account balancing the
|
||||
// terminator if the terminator is one of (), [], {}, or <>.
|
||||
char incrementor;
|
||||
uint8_t incrementor;
|
||||
|
||||
// This is the terminator of the list literal.
|
||||
char terminator;
|
||||
uint8_t terminator;
|
||||
|
||||
// This is the character set that should be used to delimit the
|
||||
// tokens within the list.
|
||||
char breakpoints[11];
|
||||
uint8_t breakpoints[11];
|
||||
} list;
|
||||
|
||||
struct {
|
||||
@ -125,14 +125,14 @@ typedef struct yp_lex_mode {
|
||||
|
||||
// When lexing a regular expression, it takes into account balancing
|
||||
// the terminator if the terminator is one of (), [], {}, or <>.
|
||||
char incrementor;
|
||||
uint8_t incrementor;
|
||||
|
||||
// This is the terminator of the regular expression.
|
||||
char terminator;
|
||||
uint8_t terminator;
|
||||
|
||||
// This is the character set that should be used to delimit the
|
||||
// tokens within the regular expression.
|
||||
char breakpoints[6];
|
||||
uint8_t breakpoints[6];
|
||||
} regexp;
|
||||
|
||||
struct {
|
||||
@ -149,21 +149,21 @@ typedef struct yp_lex_mode {
|
||||
|
||||
// When lexing a string, it takes into account balancing the
|
||||
// terminator if the terminator is one of (), [], {}, or <>.
|
||||
char incrementor;
|
||||
uint8_t incrementor;
|
||||
|
||||
// This is the terminator of the string. It is typically either a
|
||||
// single or double quote.
|
||||
char terminator;
|
||||
uint8_t terminator;
|
||||
|
||||
// This is the character set that should be used to delimit the
|
||||
// tokens within the string.
|
||||
char breakpoints[6];
|
||||
uint8_t breakpoints[6];
|
||||
} string;
|
||||
|
||||
struct {
|
||||
// These pointers point to the beginning and end of the heredoc
|
||||
// identifier.
|
||||
const char *ident_start;
|
||||
const uint8_t *ident_start;
|
||||
size_t ident_length;
|
||||
|
||||
yp_heredoc_quote_t quote;
|
||||
@ -171,7 +171,7 @@ typedef struct yp_lex_mode {
|
||||
|
||||
// This is the pointer to the character where lexing should resume
|
||||
// once the heredoc has been completely processed.
|
||||
const char *next_start;
|
||||
const uint8_t *next_start;
|
||||
} heredoc;
|
||||
} as;
|
||||
|
||||
@ -239,8 +239,8 @@ typedef enum {
|
||||
// This is a node in the linked list of comments that we've found while parsing.
|
||||
typedef struct yp_comment {
|
||||
yp_list_node_t node;
|
||||
const char *start;
|
||||
const char *end;
|
||||
const uint8_t *start;
|
||||
const uint8_t *end;
|
||||
yp_comment_type_t type;
|
||||
} yp_comment_t;
|
||||
|
||||
@ -252,7 +252,7 @@ typedef void (*yp_encoding_changed_callback_t)(yp_parser_t *parser);
|
||||
// the ability here to call out to a user-defined function to get an encoding
|
||||
// struct. If the function returns something that isn't NULL, we set that to
|
||||
// our encoding and use it to parse identifiers.
|
||||
typedef yp_encoding_t *(*yp_encoding_decode_callback_t)(yp_parser_t *parser, const char *name, size_t width);
|
||||
typedef yp_encoding_t *(*yp_encoding_decode_callback_t)(yp_parser_t *parser, const uint8_t *name, size_t width);
|
||||
|
||||
// When you are lexing through a file, the lexer needs all of the information
|
||||
// that the parser additionally provides (for example, the local table). So if
|
||||
@ -316,21 +316,21 @@ struct yp_parser {
|
||||
size_t index; // the current index into the lexer mode stack
|
||||
} lex_modes;
|
||||
|
||||
const char *start; // the pointer to the start of the source
|
||||
const char *end; // the pointer to the end of the source
|
||||
const uint8_t *start; // the pointer to the start of the source
|
||||
const uint8_t *end; // the pointer to the end of the source
|
||||
yp_token_t previous; // the previous token we were considering
|
||||
yp_token_t current; // the current token we're considering
|
||||
|
||||
// This is a special field set on the parser when we need the parser to jump
|
||||
// to a specific location when lexing the next token, as opposed to just
|
||||
// using the end of the previous token. Normally this is NULL.
|
||||
const char *next_start;
|
||||
const uint8_t *next_start;
|
||||
|
||||
// This field indicates the end of a heredoc whose identifier was found on
|
||||
// the current line. If another heredoc is found on the same line, then this
|
||||
// will be moved forward to the end of that heredoc. If no heredocs are
|
||||
// found on a line then this is NULL.
|
||||
const char *heredoc_end;
|
||||
const uint8_t *heredoc_end;
|
||||
|
||||
yp_list_t comment_list; // the list of comments that have been found while parsing
|
||||
yp_list_t warning_list; // the list of warnings that have been found while parsing
|
||||
@ -361,7 +361,7 @@ struct yp_parser {
|
||||
|
||||
// This pointer indicates where a comment must start if it is to be
|
||||
// considered an encoding comment.
|
||||
const char *encoding_comment_start;
|
||||
const uint8_t *encoding_comment_start;
|
||||
|
||||
// This is an optional callback that can be attached to the parser that will
|
||||
// be called whenever a new token is lexed by the parser.
|
||||
|
@ -2,9 +2,9 @@
|
||||
|
||||
// This is the parser that is going to handle parsing regular expressions.
|
||||
typedef struct {
|
||||
const char *start;
|
||||
const char *cursor;
|
||||
const char *end;
|
||||
const uint8_t *start;
|
||||
const uint8_t *cursor;
|
||||
const uint8_t *end;
|
||||
yp_string_list_t *named_captures;
|
||||
bool encoding_changed;
|
||||
yp_encoding_t *encoding;
|
||||
@ -12,7 +12,7 @@ typedef struct {
|
||||
|
||||
// This initializes a new parser with the given source.
|
||||
static void
|
||||
yp_regexp_parser_init(yp_regexp_parser_t *parser, const char *start, const char *end, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding) {
|
||||
yp_regexp_parser_init(yp_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding) {
|
||||
*parser = (yp_regexp_parser_t) {
|
||||
.start = start,
|
||||
.cursor = start,
|
||||
@ -25,7 +25,7 @@ yp_regexp_parser_init(yp_regexp_parser_t *parser, const char *start, const char
|
||||
|
||||
// This appends a new string to the list of named captures.
|
||||
static void
|
||||
yp_regexp_parser_named_capture(yp_regexp_parser_t *parser, const char *start, const char *end) {
|
||||
yp_regexp_parser_named_capture(yp_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
||||
yp_string_t string;
|
||||
yp_string_shared_init(&string, start, end);
|
||||
yp_string_list_append(parser->named_captures, &string);
|
||||
@ -40,7 +40,7 @@ yp_regexp_char_is_eof(yp_regexp_parser_t *parser) {
|
||||
|
||||
// Optionally accept a char and consume it if it exists.
|
||||
static inline bool
|
||||
yp_regexp_char_accept(yp_regexp_parser_t *parser, char value) {
|
||||
yp_regexp_char_accept(yp_regexp_parser_t *parser, uint8_t value) {
|
||||
if (!yp_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
||||
parser->cursor++;
|
||||
return true;
|
||||
@ -50,7 +50,7 @@ yp_regexp_char_accept(yp_regexp_parser_t *parser, char value) {
|
||||
|
||||
// Expect a character to be present and consume it.
|
||||
static inline bool
|
||||
yp_regexp_char_expect(yp_regexp_parser_t *parser, char value) {
|
||||
yp_regexp_char_expect(yp_regexp_parser_t *parser, uint8_t value) {
|
||||
if (!yp_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
||||
parser->cursor++;
|
||||
return true;
|
||||
@ -60,12 +60,12 @@ yp_regexp_char_expect(yp_regexp_parser_t *parser, char value) {
|
||||
|
||||
// This advances the current token to the next instance of the given character.
|
||||
static bool
|
||||
yp_regexp_char_find(yp_regexp_parser_t *parser, char value) {
|
||||
yp_regexp_char_find(yp_regexp_parser_t *parser, uint8_t value) {
|
||||
if (yp_regexp_char_is_eof(parser)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const char *end = (const char *) yp_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
|
||||
const uint8_t *end = (const uint8_t *) yp_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
|
||||
if (end == NULL) {
|
||||
return false;
|
||||
}
|
||||
@ -107,7 +107,7 @@ yp_regexp_char_find(yp_regexp_parser_t *parser, char value) {
|
||||
// consumed so we're in the start state.
|
||||
static bool
|
||||
yp_regexp_parse_range_quantifier(yp_regexp_parser_t *parser) {
|
||||
const char *savepoint = parser->cursor;
|
||||
const uint8_t *savepoint = parser->cursor;
|
||||
|
||||
enum {
|
||||
YP_REGEXP_RANGE_QUANTIFIER_STATE_START,
|
||||
@ -252,7 +252,7 @@ yp_regexp_parse_character_set(yp_regexp_parser_t *parser) {
|
||||
// A left bracket can either mean a POSIX class or a character set.
|
||||
static bool
|
||||
yp_regexp_parse_lbracket(yp_regexp_parser_t *parser) {
|
||||
const char *reset = parser->cursor;
|
||||
const uint8_t *reset = parser->cursor;
|
||||
|
||||
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
||||
parser->cursor++;
|
||||
@ -287,7 +287,7 @@ typedef enum {
|
||||
|
||||
// This is the set of options that are configurable on the regular expression.
|
||||
typedef struct {
|
||||
unsigned char values[YP_REGEXP_OPTION_STATE_SLOTS];
|
||||
uint8_t values[YP_REGEXP_OPTION_STATE_SLOTS];
|
||||
} yp_regexp_options_t;
|
||||
|
||||
// Initialize a new set of options to their default values.
|
||||
@ -305,9 +305,9 @@ yp_regexp_options_init(yp_regexp_options_t *options) {
|
||||
// Attempt to add the given option to the set of options. Returns true if it was
|
||||
// added, false if it was already present.
|
||||
static bool
|
||||
yp_regexp_options_add(yp_regexp_options_t *options, unsigned char key) {
|
||||
yp_regexp_options_add(yp_regexp_options_t *options, uint8_t key) {
|
||||
if (key >= YP_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= YP_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
||||
key = (unsigned char) (key - YP_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
||||
key = (uint8_t) (key - YP_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
||||
|
||||
switch (options->values[key]) {
|
||||
case YP_REGEXP_OPTION_STATE_INVALID:
|
||||
@ -328,9 +328,9 @@ yp_regexp_options_add(yp_regexp_options_t *options, unsigned char key) {
|
||||
// Attempt to remove the given option from the set of options. Returns true if
|
||||
// it was removed, false if it was already absent.
|
||||
static bool
|
||||
yp_regexp_options_remove(yp_regexp_options_t *options, unsigned char key) {
|
||||
yp_regexp_options_remove(yp_regexp_options_t *options, uint8_t key) {
|
||||
if (key >= YP_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= YP_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
||||
key = (unsigned char) (key - YP_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
||||
key = (uint8_t) (key - YP_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
||||
|
||||
switch (options->values[key]) {
|
||||
case YP_REGEXP_OPTION_STATE_INVALID:
|
||||
@ -431,7 +431,7 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
||||
parser->cursor++;
|
||||
break;
|
||||
default: { // named capture group
|
||||
const char *start = parser->cursor;
|
||||
const uint8_t *start = parser->cursor;
|
||||
if (!yp_regexp_char_find(parser, '>')) {
|
||||
return false;
|
||||
}
|
||||
@ -441,7 +441,7 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
||||
}
|
||||
break;
|
||||
case '\'': { // named capture group
|
||||
const char *start = ++parser->cursor;
|
||||
const uint8_t *start = ++parser->cursor;
|
||||
if (!yp_regexp_char_find(parser, '\'')) {
|
||||
return false;
|
||||
}
|
||||
@ -456,7 +456,7 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
||||
break;
|
||||
case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
|
||||
while (!yp_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
|
||||
if (!yp_regexp_options_add(&options, (unsigned char) *parser->cursor)) {
|
||||
if (!yp_regexp_options_add(&options, *parser->cursor)) {
|
||||
return false;
|
||||
}
|
||||
parser->cursor++;
|
||||
@ -474,7 +474,7 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
||||
case '-':
|
||||
parser->cursor++;
|
||||
while (!yp_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
||||
if (!yp_regexp_options_remove(&options, (unsigned char) *parser->cursor)) {
|
||||
if (!yp_regexp_options_remove(&options, *parser->cursor)) {
|
||||
return false;
|
||||
}
|
||||
parser->cursor++;
|
||||
@ -573,7 +573,7 @@ yp_regexp_parse_pattern(yp_regexp_parser_t *parser) {
|
||||
// Parse a regular expression and extract the names of all of the named capture
|
||||
// groups.
|
||||
YP_EXPORTED_FUNCTION bool
|
||||
yp_regexp_named_capture_group_names(const char *source, size_t size, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding) {
|
||||
yp_regexp_named_capture_group_names(const uint8_t *source, size_t size, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding) {
|
||||
yp_regexp_parser_t parser;
|
||||
yp_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
|
||||
return yp_regexp_parse_pattern(&parser);
|
||||
|
@ -14,6 +14,6 @@
|
||||
|
||||
// Parse a regular expression and extract the names of all of the named capture
|
||||
// groups.
|
||||
YP_EXPORTED_FUNCTION bool yp_regexp_named_capture_group_names(const char *source, size_t size, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding);
|
||||
YP_EXPORTED_FUNCTION bool yp_regexp_named_capture_group_names(const uint8_t *source, size_t size, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding);
|
||||
|
||||
#endif
|
||||
|
@ -12,7 +12,7 @@ static VALUE rb_cYARP<%= node.name %>;
|
||||
<%- end -%>
|
||||
|
||||
static VALUE
|
||||
yp_location_new(yp_parser_t *parser, const char *start, const char *end, VALUE source) {
|
||||
yp_location_new(yp_parser_t *parser, const uint8_t *start, const uint8_t *end, VALUE source) {
|
||||
VALUE argv[] = { source, LONG2FIX(start - parser->start), LONG2FIX(end - start) };
|
||||
return rb_class_new_instance(3, argv, rb_cYARPLocation);
|
||||
}
|
||||
@ -24,7 +24,7 @@ yp_token_new(yp_parser_t *parser, yp_token_t *token, rb_encoding *encoding, VALU
|
||||
|
||||
VALUE argv[] = {
|
||||
ID2SYM(type),
|
||||
rb_enc_str_new(token->start, token->end - token->start, encoding),
|
||||
rb_enc_str_new((const char *) token->start, token->end - token->start, encoding),
|
||||
location
|
||||
};
|
||||
|
||||
@ -33,13 +33,13 @@ yp_token_new(yp_parser_t *parser, yp_token_t *token, rb_encoding *encoding, VALU
|
||||
|
||||
static VALUE
|
||||
yp_string_new(yp_string_t *string, rb_encoding *encoding) {
|
||||
return rb_enc_str_new(yp_string_source(string), yp_string_length(string), encoding);
|
||||
return rb_enc_str_new((const char *) yp_string_source(string), yp_string_length(string), encoding);
|
||||
}
|
||||
|
||||
// Create a YARP::Source object from the given parser.
|
||||
VALUE
|
||||
yp_source_new(yp_parser_t *parser) {
|
||||
VALUE source = rb_str_new(parser->start, parser->end - parser->start);
|
||||
VALUE source = rb_str_new((const char *) parser->start, parser->end - parser->start);
|
||||
VALUE offsets = rb_ary_new_capa(parser->newline_list.size);
|
||||
|
||||
for (size_t index = 0; index < parser->newline_list.size; index++) {
|
||||
@ -85,7 +85,7 @@ yp_ast_new(yp_parser_t *parser, yp_node_t *node, rb_encoding *encoding) {
|
||||
yp_constant_t constant = parser->constant_pool.constants[index];
|
||||
|
||||
if (constant.id != 0) {
|
||||
constants[constant.id - 1] = rb_intern3(constant.start, constant.length, encoding);
|
||||
constants[constant.id - 1] = rb_intern3((const char *) constant.start, constant.length, encoding);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -21,15 +21,15 @@ typedef enum yp_token_type {
|
||||
// type and location information.
|
||||
typedef struct {
|
||||
yp_token_type_t type;
|
||||
const char *start;
|
||||
const char *end;
|
||||
const uint8_t *start;
|
||||
const uint8_t *end;
|
||||
} yp_token_t;
|
||||
|
||||
// This represents a range of bytes in the source string to which a node or
|
||||
// token corresponds.
|
||||
typedef struct {
|
||||
const char *start;
|
||||
const char *end;
|
||||
const uint8_t *start;
|
||||
const uint8_t *end;
|
||||
} yp_location_t;
|
||||
|
||||
typedef struct {
|
||||
|
@ -36,7 +36,7 @@ prettyprint_node(yp_buffer_t *buffer, yp_parser_t *parser, yp_node_t *node) {
|
||||
}
|
||||
<%- when StringParam -%>
|
||||
yp_buffer_append_str(buffer, "\"", 1);
|
||||
yp_buffer_append_str(buffer, yp_string_source(&((yp_<%= node.human %>_t *)node)-><%= param.name %>), yp_string_length(&((yp_<%= node.human %>_t *)node)-><%= param.name %>));
|
||||
yp_buffer_append_bytes(buffer, yp_string_source(&((yp_<%= node.human %>_t *)node)-><%= param.name %>), yp_string_length(&((yp_<%= node.human %>_t *)node)-><%= param.name %>));
|
||||
yp_buffer_append_str(buffer, "\"", 1);
|
||||
<%- when NodeListParam -%>
|
||||
yp_buffer_append_str(buffer, "[", 1);
|
||||
|
@ -38,7 +38,7 @@ yp_serialize_string(yp_parser_t *parser, yp_string_t *string, yp_buffer_t *buffe
|
||||
uint32_t length = yp_sizet_to_u32(yp_string_length(string));
|
||||
yp_buffer_append_u8(buffer, 2);
|
||||
yp_buffer_append_u32(buffer, length);
|
||||
yp_buffer_append_str(buffer, yp_string_source(string), length);
|
||||
yp_buffer_append_bytes(buffer, yp_string_source(string), length);
|
||||
break;
|
||||
}
|
||||
case YP_STRING_MAPPED:
|
||||
@ -234,7 +234,7 @@ serialize_token(void *data, yp_parser_t *parser, yp_token_t *token) {
|
||||
}
|
||||
|
||||
YP_EXPORTED_FUNCTION void
|
||||
yp_lex_serialize(const char *source, size_t size, const char *filepath, yp_buffer_t *buffer) {
|
||||
yp_lex_serialize(const uint8_t *source, size_t size, const char *filepath, yp_buffer_t *buffer) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, source, size, filepath);
|
||||
|
||||
@ -261,7 +261,7 @@ yp_lex_serialize(const char *source, size_t size, const char *filepath, yp_buffe
|
||||
// Parse and serialize both the AST and the tokens represented by the given
|
||||
// source to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void
|
||||
yp_parse_lex_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
|
||||
yp_parse_lex_serialize(const uint8_t *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, source, size, NULL);
|
||||
if (metadata) yp_parser_metadata(&parser, metadata);
|
||||
|
133
yarp/unescape.c
133
yarp/unescape.c
@ -5,9 +5,9 @@
|
||||
/******************************************************************************/
|
||||
|
||||
static inline bool
|
||||
yp_char_is_hexadecimal_digits(const char *c, size_t length) {
|
||||
yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
|
||||
for (size_t index = 0; index < length; index++) {
|
||||
if (!yp_char_is_hexadecimal_digit(c[index])) {
|
||||
if (!yp_char_is_hexadecimal_digit(string[index])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -18,10 +18,8 @@ yp_char_is_hexadecimal_digits(const char *c, size_t length) {
|
||||
// expensive to go through the indirection of the function pointer. Instead we
|
||||
// provide a fast path that will check if we can just return 1.
|
||||
static inline size_t
|
||||
yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
|
||||
const unsigned char *uc = (const unsigned char *) start;
|
||||
|
||||
if (parser->encoding_changed || (*uc >= 0x80)) {
|
||||
yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
||||
if (parser->encoding_changed || (*start >= 0x80)) {
|
||||
return parser->encoding.char_width(start, end - start);
|
||||
} else {
|
||||
return 1;
|
||||
@ -33,7 +31,7 @@ yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
|
||||
/******************************************************************************/
|
||||
|
||||
// This is a lookup table for unescapes that only take up a single character.
|
||||
static const unsigned char unescape_chars[] = {
|
||||
static const uint8_t unescape_chars[] = {
|
||||
['\''] = '\'',
|
||||
['\\'] = '\\',
|
||||
['a'] = '\a',
|
||||
@ -60,9 +58,8 @@ static const bool ascii_printable_chars[] = {
|
||||
};
|
||||
|
||||
static inline bool
|
||||
char_is_ascii_printable(const char c) {
|
||||
unsigned char v = (unsigned char) c;
|
||||
return (v < 0x80) && ascii_printable_chars[v];
|
||||
char_is_ascii_printable(const uint8_t b) {
|
||||
return (b < 0x80) && ascii_printable_chars[b];
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
@ -72,37 +69,37 @@ char_is_ascii_printable(const char c) {
|
||||
// Scan the 1-3 digits of octal into the value. Returns the number of digits
|
||||
// scanned.
|
||||
static inline size_t
|
||||
unescape_octal(const char *backslash, unsigned char *value) {
|
||||
*value = (unsigned char) (backslash[1] - '0');
|
||||
unescape_octal(const uint8_t *backslash, uint8_t *value) {
|
||||
*value = (uint8_t) (backslash[1] - '0');
|
||||
if (!yp_char_is_octal_digit(backslash[2])) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
*value = (unsigned char) ((*value << 3) | (backslash[2] - '0'));
|
||||
*value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
|
||||
if (!yp_char_is_octal_digit(backslash[3])) {
|
||||
return 3;
|
||||
}
|
||||
|
||||
*value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
|
||||
*value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
|
||||
return 4;
|
||||
}
|
||||
|
||||
// Convert a hexadecimal digit into its equivalent value.
|
||||
static inline unsigned char
|
||||
unescape_hexadecimal_digit(const char value) {
|
||||
return (unsigned char) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
|
||||
static inline uint8_t
|
||||
unescape_hexadecimal_digit(const uint8_t value) {
|
||||
return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
|
||||
}
|
||||
|
||||
// Scan the 1-2 digits of hexadecimal into the value. Returns the number of
|
||||
// digits scanned.
|
||||
static inline size_t
|
||||
unescape_hexadecimal(const char *backslash, unsigned char *value) {
|
||||
unescape_hexadecimal(const uint8_t *backslash, uint8_t *value) {
|
||||
*value = unescape_hexadecimal_digit(backslash[2]);
|
||||
if (!yp_char_is_hexadecimal_digit(backslash[3])) {
|
||||
return 3;
|
||||
}
|
||||
|
||||
*value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
|
||||
*value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
|
||||
return 4;
|
||||
}
|
||||
|
||||
@ -110,7 +107,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
|
||||
// digits scanned. This function assumes that the characters have already been
|
||||
// validated.
|
||||
static inline void
|
||||
unescape_unicode(const char *string, size_t length, uint32_t *value) {
|
||||
unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
|
||||
*value = 0;
|
||||
for (size_t index = 0; index < length; index++) {
|
||||
if (index != 0) *value <<= 4;
|
||||
@ -122,27 +119,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
|
||||
// 32-bit value to write. Writes the UTF-8 representation of the value to the
|
||||
// string and returns the number of bytes written.
|
||||
static inline size_t
|
||||
unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
|
||||
unsigned char *bytes = (unsigned char *) dest;
|
||||
|
||||
unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
|
||||
if (value <= 0x7F) {
|
||||
// 0xxxxxxx
|
||||
bytes[0] = (unsigned char) value;
|
||||
dest[0] = (uint8_t) value;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (value <= 0x7FF) {
|
||||
// 110xxxxx 10xxxxxx
|
||||
bytes[0] = (unsigned char) (0xC0 | (value >> 6));
|
||||
bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
|
||||
dest[0] = (uint8_t) (0xC0 | (value >> 6));
|
||||
dest[1] = (uint8_t) (0x80 | (value & 0x3F));
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (value <= 0xFFFF) {
|
||||
// 1110xxxx 10xxxxxx 10xxxxxx
|
||||
bytes[0] = (unsigned char) (0xE0 | (value >> 12));
|
||||
bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
|
||||
bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
|
||||
dest[0] = (uint8_t) (0xE0 | (value >> 12));
|
||||
dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
|
||||
dest[2] = (uint8_t) (0x80 | (value & 0x3F));
|
||||
return 3;
|
||||
}
|
||||
|
||||
@ -150,10 +145,10 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
|
||||
// the input is invalid.
|
||||
if (value <= 0x10FFFF) {
|
||||
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
bytes[0] = (unsigned char) (0xF0 | (value >> 18));
|
||||
bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
|
||||
bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
|
||||
bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
|
||||
dest[0] = (uint8_t) (0xF0 | (value >> 18));
|
||||
dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
|
||||
dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
|
||||
dest[3] = (uint8_t) (0x80 | (value & 0x3F));
|
||||
return 4;
|
||||
}
|
||||
|
||||
@ -161,9 +156,9 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
|
||||
// want to just crash, so instead we'll add an error to the error list and put
|
||||
// in a replacement character instead.
|
||||
yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
|
||||
bytes[0] = 0xEF;
|
||||
bytes[1] = 0xBF;
|
||||
bytes[2] = 0xBD;
|
||||
dest[0] = 0xEF;
|
||||
dest[1] = 0xBF;
|
||||
dest[2] = 0xBD;
|
||||
return 3;
|
||||
}
|
||||
|
||||
@ -175,24 +170,22 @@ typedef enum {
|
||||
} yp_unescape_flag_t;
|
||||
|
||||
// Unescape a single character value based on the given flags.
|
||||
static inline unsigned char
|
||||
unescape_char(const unsigned char value, const unsigned char flags) {
|
||||
unsigned char unescaped = value;
|
||||
|
||||
static inline uint8_t
|
||||
unescape_char(uint8_t value, const uint8_t flags) {
|
||||
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
||||
unescaped &= 0x1f;
|
||||
value &= 0x1f;
|
||||
}
|
||||
|
||||
if (flags & YP_UNESCAPE_FLAG_META) {
|
||||
unescaped |= 0x80;
|
||||
value |= 0x80;
|
||||
}
|
||||
|
||||
return unescaped;
|
||||
return value;
|
||||
}
|
||||
|
||||
// Read a specific escape sequence into the given destination.
|
||||
static const char *
|
||||
unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backslash, const char *end, const unsigned char flags, bool write_to_str) {
|
||||
static const uint8_t *
|
||||
unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t *backslash, const uint8_t *end, const uint8_t flags, bool write_to_str) {
|
||||
switch (backslash[1]) {
|
||||
case 'a':
|
||||
case 'b':
|
||||
@ -204,27 +197,27 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
case 't':
|
||||
case 'v':
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
|
||||
dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
|
||||
}
|
||||
return backslash + 2;
|
||||
// \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9': {
|
||||
unsigned char value;
|
||||
const char *cursor = backslash + unescape_octal(backslash, &value);
|
||||
uint8_t value;
|
||||
const uint8_t *cursor = backslash + unescape_octal(backslash, &value);
|
||||
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char(value, flags);
|
||||
dest[(*dest_length)++] = unescape_char(value, flags);
|
||||
}
|
||||
return cursor;
|
||||
}
|
||||
// \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
|
||||
case 'x': {
|
||||
unsigned char value;
|
||||
const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
|
||||
uint8_t value;
|
||||
const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value);
|
||||
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char(value, flags);
|
||||
dest[(*dest_length)++] = unescape_char(value, flags);
|
||||
}
|
||||
return cursor;
|
||||
}
|
||||
@ -237,14 +230,14 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
}
|
||||
|
||||
if ((backslash + 3) < end && backslash[2] == '{') {
|
||||
const char *unicode_cursor = backslash + 3;
|
||||
const char *extra_codepoints_start = NULL;
|
||||
const uint8_t *unicode_cursor = backslash + 3;
|
||||
const uint8_t *extra_codepoints_start = NULL;
|
||||
int codepoints_count = 0;
|
||||
|
||||
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
|
||||
|
||||
while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
|
||||
const char *unicode_start = unicode_cursor;
|
||||
const uint8_t *unicode_start = unicode_cursor;
|
||||
size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
|
||||
|
||||
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
|
||||
@ -311,7 +304,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
|
||||
case '?':
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
|
||||
dest[(*dest_length)++] = unescape_char(0x7f, flags);
|
||||
}
|
||||
return backslash + 3;
|
||||
default: {
|
||||
@ -321,7 +314,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
}
|
||||
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
|
||||
dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
|
||||
}
|
||||
return backslash + 3;
|
||||
}
|
||||
@ -349,7 +342,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
|
||||
case '?':
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
|
||||
dest[(*dest_length)++] = unescape_char(0x7f, flags);
|
||||
}
|
||||
return backslash + 4;
|
||||
default:
|
||||
@ -359,7 +352,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
}
|
||||
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
|
||||
dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
|
||||
}
|
||||
return backslash + 4;
|
||||
}
|
||||
@ -388,7 +381,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
||||
|
||||
if (char_is_ascii_printable(backslash[3])) {
|
||||
if (write_to_str) {
|
||||
dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
|
||||
dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
|
||||
}
|
||||
return backslash + 4;
|
||||
}
|
||||
@ -454,7 +447,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
||||
return;
|
||||
}
|
||||
|
||||
const char *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
|
||||
const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
|
||||
|
||||
if (backslash == NULL) {
|
||||
// Here there are no escapes, so we can reference the source directly.
|
||||
@ -463,21 +456,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
||||
|
||||
// Here we have found an escape character, so we need to handle all escapes
|
||||
// within the string.
|
||||
char *allocated = malloc(string->length);
|
||||
uint8_t *allocated = malloc(string->length);
|
||||
if (allocated == NULL) {
|
||||
yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
|
||||
return;
|
||||
}
|
||||
|
||||
// This is the memory address where we're putting the unescaped string.
|
||||
char *dest = allocated;
|
||||
uint8_t *dest = allocated;
|
||||
size_t dest_length = 0;
|
||||
|
||||
// This is the current position in the source string that we're looking at.
|
||||
// It's going to move along behind the backslash so that we can copy each
|
||||
// segment of the string that doesn't contain an escape.
|
||||
const char *cursor = string->source;
|
||||
const char *end = string->source + string->length;
|
||||
const uint8_t *cursor = string->source;
|
||||
const uint8_t *end = string->source + string->length;
|
||||
|
||||
// For each escape found in the source string, we will handle it and update
|
||||
// the moving cursor->backslash window.
|
||||
@ -496,7 +489,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
||||
switch (backslash[1]) {
|
||||
case '\\':
|
||||
case '\'':
|
||||
dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
|
||||
dest[dest_length++] = unescape_chars[backslash[1]];
|
||||
cursor = backslash + 2;
|
||||
break;
|
||||
default:
|
||||
@ -542,7 +535,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
||||
// actually perform any string manipulations. Instead, it calculates how long
|
||||
// the unescaped character is, and returns that value
|
||||
size_t
|
||||
yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
|
||||
yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
|
||||
assert(unescape_type != YP_UNESCAPE_NONE);
|
||||
|
||||
switch (backslash[1]) {
|
||||
@ -558,11 +551,11 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
|
||||
// handle all of the different unescapes.
|
||||
assert(unescape_type == YP_UNESCAPE_ALL);
|
||||
|
||||
unsigned char flags = YP_UNESCAPE_FLAG_NONE;
|
||||
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
|
||||
if (expect_single_codepoint)
|
||||
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
|
||||
|
||||
const char *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false);
|
||||
const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false);
|
||||
assert(cursor > backslash);
|
||||
|
||||
return (size_t) (cursor - backslash);
|
||||
@ -574,7 +567,7 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
|
||||
// string, a type of unescaping, and a pointer to a result string. It returns a
|
||||
// boolean indicating whether or not the unescaping was successful.
|
||||
YP_EXPORTED_FUNCTION bool
|
||||
yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
|
||||
yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, start, length, NULL);
|
||||
|
||||
|
@ -35,10 +35,10 @@ YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_
|
||||
|
||||
// Accepts a source string and a type of unescaping and returns the unescaped version.
|
||||
// The caller must yp_string_free(result); after calling this function.
|
||||
YP_EXPORTED_FUNCTION bool yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result);
|
||||
YP_EXPORTED_FUNCTION bool yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result);
|
||||
|
||||
// Returns the number of bytes that encompass the first escape sequence in the
|
||||
// given string.
|
||||
size_t yp_unescape_calculate_difference(yp_parser_t *parser, const char *value, yp_unescape_type_t unescape_type, bool expect_single_codepoint);
|
||||
size_t yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *value, yp_unescape_type_t unescape_type, bool expect_single_codepoint);
|
||||
|
||||
#endif
|
||||
|
@ -63,8 +63,13 @@ yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
|
||||
// Append a string to the buffer.
|
||||
void
|
||||
yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
|
||||
const void *source = value;
|
||||
yp_buffer_append(buffer, source, length);
|
||||
yp_buffer_append(buffer, value, length);
|
||||
}
|
||||
|
||||
// Append a list of bytes to the buffer.
|
||||
void
|
||||
yp_buffer_append_bytes(yp_buffer_t *buffer, const uint8_t *value, size_t length) {
|
||||
yp_buffer_append(buffer, (const char *) value, length);
|
||||
}
|
||||
|
||||
// Append a single byte to the buffer.
|
||||
|
@ -36,6 +36,9 @@ void yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length);
|
||||
// Append a string to the buffer.
|
||||
void yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length);
|
||||
|
||||
// Append a list of bytes to the buffer.
|
||||
void yp_buffer_append_bytes(yp_buffer_t *buffer, const uint8_t *value, size_t length);
|
||||
|
||||
// Append a single byte to the buffer.
|
||||
void yp_buffer_append_u8(yp_buffer_t *buffer, uint8_t value);
|
||||
|
||||
|
@ -13,8 +13,8 @@
|
||||
#define YP_NUMBER_BIT_HEXADECIMAL_DIGIT (1 << 6)
|
||||
#define YP_NUMBER_BIT_HEXADECIMAL_NUMBER (1 << 7)
|
||||
|
||||
static const unsigned char yp_char_table[256] = {
|
||||
//0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
static const uint8_t yp_byte_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 0, 0, // 0x
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
||||
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
|
||||
@ -33,7 +33,7 @@ static const unsigned char yp_char_table[256] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
|
||||
};
|
||||
|
||||
static const unsigned char yp_number_table[256] = {
|
||||
static const uint8_t yp_number_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 1x
|
||||
@ -54,20 +54,20 @@ static const unsigned char yp_number_table[256] = {
|
||||
};
|
||||
|
||||
static inline size_t
|
||||
yp_strspn_char_kind(const char *string, ptrdiff_t length, unsigned char kind) {
|
||||
yp_strspn_char_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) {
|
||||
if (length <= 0) return 0;
|
||||
|
||||
size_t size = 0;
|
||||
size_t maximum = (size_t) length;
|
||||
|
||||
while (size < maximum && (yp_char_table[(unsigned char) string[size]] & kind)) size++;
|
||||
while (size < maximum && (yp_byte_table[string[size]] & kind)) size++;
|
||||
return size;
|
||||
}
|
||||
|
||||
// Returns the number of characters at the start of the string that are
|
||||
// whitespace. Disallows searching past the given maximum number of characters.
|
||||
size_t
|
||||
yp_strspn_whitespace(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_whitespace(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_char_kind(string, length, YP_CHAR_BIT_WHITESPACE);
|
||||
}
|
||||
|
||||
@ -75,13 +75,13 @@ yp_strspn_whitespace(const char *string, ptrdiff_t length) {
|
||||
// whitespace while also tracking the location of each newline. Disallows
|
||||
// searching past the given maximum number of characters.
|
||||
size_t
|
||||
yp_strspn_whitespace_newlines(const char *string, ptrdiff_t length, yp_newline_list_t *newline_list, bool stop_at_newline) {
|
||||
yp_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, yp_newline_list_t *newline_list, bool stop_at_newline) {
|
||||
if (length <= 0) return 0;
|
||||
|
||||
size_t size = 0;
|
||||
size_t maximum = (size_t) length;
|
||||
|
||||
while (size < maximum && (yp_char_table[(unsigned char) string[size]] & YP_CHAR_BIT_WHITESPACE)) {
|
||||
while (size < maximum && (yp_byte_table[string[size]] & YP_CHAR_BIT_WHITESPACE)) {
|
||||
if (string[size] == '\n') {
|
||||
if (stop_at_newline) {
|
||||
return size + 1;
|
||||
@ -100,42 +100,42 @@ yp_strspn_whitespace_newlines(const char *string, ptrdiff_t length, yp_newline_l
|
||||
// Returns the number of characters at the start of the string that are inline
|
||||
// whitespace. Disallows searching past the given maximum number of characters.
|
||||
size_t
|
||||
yp_strspn_inline_whitespace(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_char_kind(string, length, YP_CHAR_BIT_INLINE_WHITESPACE);
|
||||
}
|
||||
|
||||
// Returns the number of characters at the start of the string that are regexp
|
||||
// options. Disallows searching past the given maximum number of characters.
|
||||
size_t
|
||||
yp_strspn_regexp_option(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_regexp_option(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_char_kind(string, length, YP_CHAR_BIT_REGEXP_OPTION);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
yp_char_is_char_kind(const char c, unsigned char kind) {
|
||||
return (yp_char_table[(unsigned char) c] & kind) != 0;
|
||||
yp_char_is_char_kind(const uint8_t b, uint8_t kind) {
|
||||
return (yp_byte_table[b] & kind) != 0;
|
||||
}
|
||||
|
||||
// Returns true if the given character is a whitespace character.
|
||||
bool
|
||||
yp_char_is_whitespace(const char c) {
|
||||
return yp_char_is_char_kind(c, YP_CHAR_BIT_WHITESPACE);
|
||||
yp_char_is_whitespace(const uint8_t b) {
|
||||
return yp_char_is_char_kind(b, YP_CHAR_BIT_WHITESPACE);
|
||||
}
|
||||
|
||||
// Returns true if the given character is an inline whitespace character.
|
||||
bool
|
||||
yp_char_is_inline_whitespace(const char c) {
|
||||
return yp_char_is_char_kind(c, YP_CHAR_BIT_INLINE_WHITESPACE);
|
||||
yp_char_is_inline_whitespace(const uint8_t b) {
|
||||
return yp_char_is_char_kind(b, YP_CHAR_BIT_INLINE_WHITESPACE);
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
yp_strspn_number_kind(const char *string, ptrdiff_t length, unsigned char kind) {
|
||||
yp_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) {
|
||||
if (length <= 0) return 0;
|
||||
|
||||
size_t size = 0;
|
||||
size_t maximum = (size_t) length;
|
||||
|
||||
while (size < maximum && (yp_number_table[(unsigned char) string[size]] & kind)) size++;
|
||||
while (size < maximum && (yp_number_table[string[size]] & kind)) size++;
|
||||
return size;
|
||||
}
|
||||
|
||||
@ -143,7 +143,7 @@ yp_strspn_number_kind(const char *string, ptrdiff_t length, unsigned char kind)
|
||||
// digits or underscores. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t
|
||||
yp_strspn_binary_number(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_binary_number(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_BINARY_NUMBER);
|
||||
}
|
||||
|
||||
@ -151,14 +151,14 @@ yp_strspn_binary_number(const char *string, ptrdiff_t length) {
|
||||
// digits or underscores. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t
|
||||
yp_strspn_octal_number(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_octal_number(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_OCTAL_NUMBER);
|
||||
}
|
||||
|
||||
// Returns the number of characters at the start of the string that are decimal
|
||||
// digits. Disallows searching past the given maximum number of characters.
|
||||
size_t
|
||||
yp_strspn_decimal_digit(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_DECIMAL_DIGIT);
|
||||
}
|
||||
|
||||
@ -166,7 +166,7 @@ yp_strspn_decimal_digit(const char *string, ptrdiff_t length) {
|
||||
// digits or underscores. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t
|
||||
yp_strspn_decimal_number(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_decimal_number(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_DECIMAL_NUMBER);
|
||||
}
|
||||
|
||||
@ -174,7 +174,7 @@ yp_strspn_decimal_number(const char *string, ptrdiff_t length) {
|
||||
// hexadecimal digits. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t
|
||||
yp_strspn_hexadecimal_digit(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_HEXADECIMAL_DIGIT);
|
||||
}
|
||||
|
||||
@ -182,37 +182,37 @@ yp_strspn_hexadecimal_digit(const char *string, ptrdiff_t length) {
|
||||
// hexadecimal digits or underscores. Disallows searching past the given maximum
|
||||
// number of characters.
|
||||
size_t
|
||||
yp_strspn_hexadecimal_number(const char *string, ptrdiff_t length) {
|
||||
yp_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length) {
|
||||
return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_HEXADECIMAL_NUMBER);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
yp_char_is_number_kind(const char c, unsigned char kind) {
|
||||
return (yp_number_table[(unsigned char) c] & kind) != 0;
|
||||
yp_char_is_number_kind(const uint8_t b, uint8_t kind) {
|
||||
return (yp_number_table[b] & kind) != 0;
|
||||
}
|
||||
|
||||
// Returns true if the given character is a binary digit.
|
||||
bool
|
||||
yp_char_is_binary_digit(const char c) {
|
||||
return yp_char_is_number_kind(c, YP_NUMBER_BIT_BINARY_DIGIT);
|
||||
yp_char_is_binary_digit(const uint8_t b) {
|
||||
return yp_char_is_number_kind(b, YP_NUMBER_BIT_BINARY_DIGIT);
|
||||
}
|
||||
|
||||
// Returns true if the given character is an octal digit.
|
||||
bool
|
||||
yp_char_is_octal_digit(const char c) {
|
||||
return yp_char_is_number_kind(c, YP_NUMBER_BIT_OCTAL_DIGIT);
|
||||
yp_char_is_octal_digit(const uint8_t b) {
|
||||
return yp_char_is_number_kind(b, YP_NUMBER_BIT_OCTAL_DIGIT);
|
||||
}
|
||||
|
||||
// Returns true if the given character is a decimal digit.
|
||||
bool
|
||||
yp_char_is_decimal_digit(const char c) {
|
||||
return yp_char_is_number_kind(c, YP_NUMBER_BIT_DECIMAL_DIGIT);
|
||||
yp_char_is_decimal_digit(const uint8_t b) {
|
||||
return yp_char_is_number_kind(b, YP_NUMBER_BIT_DECIMAL_DIGIT);
|
||||
}
|
||||
|
||||
// Returns true if the given character is a hexadecimal digit.
|
||||
bool
|
||||
yp_char_is_hexadecimal_digit(const char c) {
|
||||
return yp_char_is_number_kind(c, YP_NUMBER_BIT_HEXADECIMAL_DIGIT);
|
||||
yp_char_is_hexadecimal_digit(const uint8_t b) {
|
||||
return yp_char_is_number_kind(b, YP_NUMBER_BIT_HEXADECIMAL_DIGIT);
|
||||
}
|
||||
|
||||
#undef YP_CHAR_BIT_WHITESPACE
|
||||
|
@ -9,67 +9,67 @@
|
||||
|
||||
// Returns the number of characters at the start of the string that are
|
||||
// whitespace. Disallows searching past the given maximum number of characters.
|
||||
size_t yp_strspn_whitespace(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_whitespace(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are
|
||||
// whitespace while also tracking the location of each newline. Disallows
|
||||
// searching past the given maximum number of characters.
|
||||
size_t
|
||||
yp_strspn_whitespace_newlines(const char *string, ptrdiff_t length, yp_newline_list_t *newline_list, bool);
|
||||
yp_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, yp_newline_list_t *newline_list, bool stop_at_newline);
|
||||
|
||||
// Returns the number of characters at the start of the string that are inline
|
||||
// whitespace. Disallows searching past the given maximum number of characters.
|
||||
size_t yp_strspn_inline_whitespace(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are decimal
|
||||
// digits. Disallows searching past the given maximum number of characters.
|
||||
size_t yp_strspn_decimal_digit(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are
|
||||
// hexadecimal digits. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t yp_strspn_hexadecimal_digit(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are octal
|
||||
// digits or underscores. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t yp_strspn_octal_number(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_octal_number(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are decimal
|
||||
// digits or underscores. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t yp_strspn_decimal_number(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_decimal_number(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are
|
||||
// hexadecimal digits or underscores. Disallows searching past the given maximum
|
||||
// number of characters.
|
||||
size_t yp_strspn_hexadecimal_number(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are regexp
|
||||
// options. Disallows searching past the given maximum number of characters.
|
||||
size_t yp_strspn_regexp_option(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_regexp_option(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns the number of characters at the start of the string that are binary
|
||||
// digits or underscores. Disallows searching past the given maximum number of
|
||||
// characters.
|
||||
size_t yp_strspn_binary_number(const char *string, ptrdiff_t length);
|
||||
size_t yp_strspn_binary_number(const uint8_t *string, ptrdiff_t length);
|
||||
|
||||
// Returns true if the given character is a whitespace character.
|
||||
bool yp_char_is_whitespace(const char c);
|
||||
bool yp_char_is_whitespace(const uint8_t b);
|
||||
|
||||
// Returns true if the given character is an inline whitespace character.
|
||||
bool yp_char_is_inline_whitespace(const char c);
|
||||
bool yp_char_is_inline_whitespace(const uint8_t b);
|
||||
|
||||
// Returns true if the given character is a binary digit.
|
||||
bool yp_char_is_binary_digit(const char c);
|
||||
bool yp_char_is_binary_digit(const uint8_t b);
|
||||
|
||||
// Returns true if the given character is an octal digit.
|
||||
bool yp_char_is_octal_digit(const char c);
|
||||
bool yp_char_is_octal_digit(const uint8_t b);
|
||||
|
||||
// Returns true if the given character is a decimal digit.
|
||||
bool yp_char_is_decimal_digit(const char c);
|
||||
bool yp_char_is_decimal_digit(const uint8_t b);
|
||||
|
||||
// Returns true if the given character is a hexadecimal digit.
|
||||
bool yp_char_is_hexadecimal_digit(const char c);
|
||||
bool yp_char_is_hexadecimal_digit(const uint8_t b);
|
||||
|
||||
#endif
|
||||
|
@ -48,12 +48,12 @@ yp_constant_id_list_free(yp_constant_id_list_t *list) {
|
||||
// A relatively simple hash function (djb2) that is used to hash strings. We are
|
||||
// optimizing here for simplicity and speed.
|
||||
static inline size_t
|
||||
yp_constant_pool_hash(const char *start, size_t length) {
|
||||
yp_constant_pool_hash(const uint8_t *start, size_t length) {
|
||||
// This is a prime number used as the initial value for the hash function.
|
||||
size_t value = 5381;
|
||||
|
||||
for (size_t index = 0; index < length; index++) {
|
||||
value = ((value << 5) + value) + ((unsigned char) start[index]);
|
||||
value = ((value << 5) + value) + start[index];
|
||||
}
|
||||
|
||||
return value;
|
||||
@ -109,7 +109,7 @@ yp_constant_pool_init(yp_constant_pool_t *pool, size_t capacity) {
|
||||
// Insert a constant into a constant pool. Returns the id of the constant, or 0
|
||||
// if any potential calls to resize fail.
|
||||
yp_constant_id_t
|
||||
yp_constant_pool_insert(yp_constant_pool_t *pool, const char *start, size_t length) {
|
||||
yp_constant_pool_insert(yp_constant_pool_t *pool, const uint8_t *start, size_t length) {
|
||||
if (pool->size >= (pool->capacity / 4 * 3)) {
|
||||
if (!yp_constant_pool_resize(pool)) return 0;
|
||||
}
|
||||
|
@ -40,7 +40,7 @@ void yp_constant_id_list_free(yp_constant_id_list_t *list);
|
||||
|
||||
typedef struct {
|
||||
yp_constant_id_t id;
|
||||
const char *start;
|
||||
const uint8_t *start;
|
||||
size_t length;
|
||||
size_t hash;
|
||||
} yp_constant_t;
|
||||
@ -59,7 +59,7 @@ bool yp_constant_pool_init(yp_constant_pool_t *pool, size_t capacity);
|
||||
|
||||
// Insert a constant into a constant pool. Returns the id of the constant, or 0
|
||||
// if any potential calls to resize fail.
|
||||
yp_constant_id_t yp_constant_pool_insert(yp_constant_pool_t *pool, const char *start, size_t length);
|
||||
yp_constant_id_t yp_constant_pool_insert(yp_constant_pool_t *pool, const uint8_t *start, size_t length);
|
||||
|
||||
// Free the memory associated with a constant pool.
|
||||
void yp_constant_pool_free(yp_constant_pool_t *pool);
|
||||
|
@ -8,7 +8,7 @@
|
||||
void *
|
||||
yp_memchr(const void *memory, int character, size_t number, bool encoding_changed, yp_encoding_t *encoding) {
|
||||
if (encoding_changed && encoding->multibyte && character >= YP_MEMCHR_TRAILING_BYTE_MINIMUM) {
|
||||
const char *source = (const char *) memory;
|
||||
const uint8_t *source = (const uint8_t *) memory;
|
||||
size_t index = 0;
|
||||
|
||||
while (index < number) {
|
||||
|
@ -3,7 +3,7 @@
|
||||
// Initialize a new newline list with the given capacity. Returns true if the
|
||||
// allocation of the offsets succeeds, otherwise returns false.
|
||||
bool
|
||||
yp_newline_list_init(yp_newline_list_t *list, const char *start, size_t capacity) {
|
||||
yp_newline_list_init(yp_newline_list_t *list, const uint8_t *start, size_t capacity) {
|
||||
list->offsets = (size_t *) calloc(capacity, sizeof(size_t));
|
||||
if (list->offsets == NULL) return false;
|
||||
|
||||
@ -23,7 +23,7 @@ yp_newline_list_init(yp_newline_list_t *list, const char *start, size_t capacity
|
||||
// Append a new offset to the newline list. Returns true if the reallocation of
|
||||
// the offsets succeeds (if one was necessary), otherwise returns false.
|
||||
bool
|
||||
yp_newline_list_append(yp_newline_list_t *list, const char *cursor) {
|
||||
yp_newline_list_append(yp_newline_list_t *list, const uint8_t *cursor) {
|
||||
if (list->size == list->capacity) {
|
||||
list->capacity = (list->capacity * 3) / 2;
|
||||
list->offsets = (size_t *) realloc(list->offsets, list->capacity * sizeof(size_t));
|
||||
@ -33,6 +33,7 @@ yp_newline_list_append(yp_newline_list_t *list, const char *cursor) {
|
||||
assert(*cursor == '\n');
|
||||
assert(cursor >= list->start);
|
||||
size_t newline_offset = (size_t) (cursor - list->start + 1);
|
||||
|
||||
assert(list->size == 0 || newline_offset > list->offsets[list->size - 1]);
|
||||
list->offsets[list->size++] = newline_offset;
|
||||
|
||||
@ -41,7 +42,7 @@ yp_newline_list_append(yp_newline_list_t *list, const char *cursor) {
|
||||
|
||||
// Conditionally append a new offset to the newline list, if the value passed in is a newline.
|
||||
bool
|
||||
yp_newline_list_check_append(yp_newline_list_t *list, const char *cursor) {
|
||||
yp_newline_list_check_append(yp_newline_list_t *list, const uint8_t *cursor) {
|
||||
if (*cursor != '\n') {
|
||||
return true;
|
||||
}
|
||||
@ -105,7 +106,7 @@ yp_newline_list_line_column_scan(yp_newline_list_t *list, size_t offset) {
|
||||
// list, the line and column of the closest offset less than the given offset
|
||||
// are returned.
|
||||
yp_line_column_t
|
||||
yp_newline_list_line_column(yp_newline_list_t *list, const char *cursor) {
|
||||
yp_newline_list_line_column(yp_newline_list_t *list, const uint8_t *cursor) {
|
||||
assert(cursor >= list->start);
|
||||
size_t offset = (size_t) (cursor - list->start);
|
||||
yp_line_column_t result;
|
||||
|
@ -19,7 +19,7 @@
|
||||
// A list of offsets of newlines in a string. The offsets are assumed to be
|
||||
// sorted/inserted in ascending order.
|
||||
typedef struct {
|
||||
const char *start;
|
||||
const uint8_t *start;
|
||||
|
||||
size_t *offsets;
|
||||
size_t size;
|
||||
@ -41,19 +41,19 @@ typedef struct {
|
||||
|
||||
// Initialize a new newline list with the given capacity. Returns true if the
|
||||
// allocation of the offsets succeeds, otherwise returns false.
|
||||
bool yp_newline_list_init(yp_newline_list_t *list, const char *start, size_t capacity);
|
||||
bool yp_newline_list_init(yp_newline_list_t *list, const uint8_t *start, size_t capacity);
|
||||
|
||||
// Append a new offset to the newline list. Returns true if the reallocation of
|
||||
// the offsets succeeds (if one was necessary), otherwise returns false.
|
||||
bool yp_newline_list_append(yp_newline_list_t *list, const char *cursor);
|
||||
bool yp_newline_list_append(yp_newline_list_t *list, const uint8_t *cursor);
|
||||
|
||||
// Conditionally append a new offset to the newline list, if the value passed in is a newline.
|
||||
bool yp_newline_list_check_append(yp_newline_list_t *list, const char *cursor);
|
||||
bool yp_newline_list_check_append(yp_newline_list_t *list, const uint8_t *cursor);
|
||||
|
||||
// Returns the line and column of the given offset. If the offset is not in the
|
||||
// list, the line and column of the closest offset less than the given offset
|
||||
// are returned.
|
||||
yp_line_column_t yp_newline_list_line_column(yp_newline_list_t *list, const char *cursor);
|
||||
yp_line_column_t yp_newline_list_line_column(yp_newline_list_t *list, const uint8_t *cursor);
|
||||
|
||||
// Free the internal memory allocated for the newline list.
|
||||
void yp_newline_list_free(yp_newline_list_t *list);
|
||||
|
@ -12,18 +12,19 @@
|
||||
|
||||
// Initialize a shared string that is based on initial input.
|
||||
void
|
||||
yp_string_shared_init(yp_string_t *string, const char *start, const char *end) {
|
||||
yp_string_shared_init(yp_string_t *string, const uint8_t *start, const uint8_t *end) {
|
||||
assert(start <= end);
|
||||
|
||||
*string = (yp_string_t) {
|
||||
.type = YP_STRING_SHARED,
|
||||
.source = (char*) start,
|
||||
.source = start,
|
||||
.length = (size_t) (end - start)
|
||||
};
|
||||
}
|
||||
|
||||
// Initialize an owned string that is responsible for freeing allocated memory.
|
||||
void
|
||||
yp_string_owned_init(yp_string_t *string, char *source, size_t length) {
|
||||
yp_string_owned_init(yp_string_t *string, uint8_t *source, size_t length) {
|
||||
*string = (yp_string_t) {
|
||||
.type = YP_STRING_OWNED,
|
||||
.source = source,
|
||||
@ -36,13 +37,13 @@ void
|
||||
yp_string_constant_init(yp_string_t *string, const char *source, size_t length) {
|
||||
*string = (yp_string_t) {
|
||||
.type = YP_STRING_CONSTANT,
|
||||
.source = (char*) source,
|
||||
.source = (const uint8_t *) source,
|
||||
.length = length
|
||||
};
|
||||
}
|
||||
|
||||
static void
|
||||
yp_string_mapped_init_internal(yp_string_t *string, char *source, size_t length) {
|
||||
yp_string_mapped_init_internal(yp_string_t *string, uint8_t *source, size_t length) {
|
||||
*string = (yp_string_t) {
|
||||
.type = YP_STRING_MAPPED,
|
||||
.source = source,
|
||||
@ -67,13 +68,13 @@ yp_string_ensure_owned(yp_string_t *string) {
|
||||
if (string->type == YP_STRING_OWNED) return;
|
||||
|
||||
size_t length = yp_string_length(string);
|
||||
const char *source = yp_string_source(string);
|
||||
const uint8_t *source = yp_string_source(string);
|
||||
|
||||
char *memory = malloc(length);
|
||||
uint8_t *memory = malloc(length);
|
||||
if (!memory) return;
|
||||
|
||||
yp_string_owned_init(string, memory, length);
|
||||
memcpy(string->source, source, length);
|
||||
memcpy((void *) string->source, source, length);
|
||||
}
|
||||
|
||||
// Returns the length associated with the string.
|
||||
@ -83,7 +84,7 @@ yp_string_length(const yp_string_t *string) {
|
||||
}
|
||||
|
||||
// Returns the start pointer associated with the string.
|
||||
YP_EXPORTED_FUNCTION const char *
|
||||
YP_EXPORTED_FUNCTION const uint8_t *
|
||||
yp_string_source(const yp_string_t *string) {
|
||||
return string->source;
|
||||
}
|
||||
@ -91,15 +92,16 @@ yp_string_source(const yp_string_t *string) {
|
||||
// Free the associated memory of the given string.
|
||||
YP_EXPORTED_FUNCTION void
|
||||
yp_string_free(yp_string_t *string) {
|
||||
void *memory = (void *) string->source;
|
||||
|
||||
if (string->type == YP_STRING_OWNED) {
|
||||
free(string->source);
|
||||
free(memory);
|
||||
} else if (string->type == YP_STRING_MAPPED && string->length) {
|
||||
void *memory = (void *) string->source;
|
||||
#if defined(_WIN32)
|
||||
#if defined(_WIN32)
|
||||
UnmapViewOfFile(memory);
|
||||
#else
|
||||
#else
|
||||
munmap(memory, string->length);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -126,8 +128,8 @@ yp_string_mapped_init(yp_string_t *string, const char *filepath) {
|
||||
// the source to a constant empty string and return.
|
||||
if (file_size == 0) {
|
||||
CloseHandle(file);
|
||||
char empty_string[] = "";
|
||||
yp_string_mapped_init_internal(string, empty_string, 0);
|
||||
uint8_t empty[] = "";
|
||||
yp_string_mapped_init_internal(string, empty, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -140,7 +142,7 @@ yp_string_mapped_init(yp_string_t *string, const char *filepath) {
|
||||
}
|
||||
|
||||
// Map the file into memory.
|
||||
char *source = (char *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
|
||||
uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
|
||||
CloseHandle(mapping);
|
||||
CloseHandle(file);
|
||||
|
||||
@ -169,12 +171,12 @@ yp_string_mapped_init(yp_string_t *string, const char *filepath) {
|
||||
|
||||
// mmap the file descriptor to virtually get the contents
|
||||
size_t size = (size_t) sb.st_size;
|
||||
char *source = NULL;
|
||||
uint8_t *source = NULL;
|
||||
|
||||
if (size == 0) {
|
||||
close(fd);
|
||||
char empty_string[] = "";
|
||||
yp_string_mapped_init_internal(string, empty_string, 0);
|
||||
uint8_t empty[] = "";
|
||||
yp_string_mapped_init_internal(string, empty, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -12,17 +12,17 @@
|
||||
// This struct represents a string value.
|
||||
typedef struct {
|
||||
enum { YP_STRING_SHARED, YP_STRING_OWNED, YP_STRING_CONSTANT, YP_STRING_MAPPED } type;
|
||||
char *source;
|
||||
const uint8_t *source;
|
||||
size_t length;
|
||||
} yp_string_t;
|
||||
|
||||
#define YP_EMPTY_STRING ((yp_string_t) { .type = YP_STRING_CONSTANT, .source = NULL, .length = 0 })
|
||||
|
||||
// Initialize a shared string that is based on initial input.
|
||||
void yp_string_shared_init(yp_string_t *string, const char *start, const char *end);
|
||||
void yp_string_shared_init(yp_string_t *string, const uint8_t *start, const uint8_t *end);
|
||||
|
||||
// Initialize an owned string that is responsible for freeing allocated memory.
|
||||
void yp_string_owned_init(yp_string_t *string, char *source, size_t length);
|
||||
void yp_string_owned_init(yp_string_t *string, uint8_t *source, size_t length);
|
||||
|
||||
// Initialize a constant string that doesn't own its memory source.
|
||||
void yp_string_constant_init(yp_string_t *string, const char *source, size_t length);
|
||||
@ -49,7 +49,7 @@ void yp_string_ensure_owned(yp_string_t *string);
|
||||
YP_EXPORTED_FUNCTION size_t yp_string_length(const yp_string_t *string);
|
||||
|
||||
// Returns the start pointer associated with the string.
|
||||
YP_EXPORTED_FUNCTION const char * yp_string_source(const yp_string_t *string);
|
||||
YP_EXPORTED_FUNCTION const uint8_t * yp_string_source(const yp_string_t *string);
|
||||
|
||||
// Free the associated memory of the given string.
|
||||
YP_EXPORTED_FUNCTION void yp_string_free(yp_string_t *string);
|
||||
|
@ -1,11 +1,5 @@
|
||||
#include "yarp/util/yp_string_list.h"
|
||||
|
||||
// Allocate a new yp_string_list_t.
|
||||
yp_string_list_t *
|
||||
yp_string_list_alloc(void) {
|
||||
return (yp_string_list_t *) malloc(sizeof(yp_string_list_t));
|
||||
}
|
||||
|
||||
// Initialize a yp_string_list_t with its default values.
|
||||
void
|
||||
yp_string_list_init(yp_string_list_t *string_list) {
|
||||
|
@ -13,9 +13,6 @@ typedef struct {
|
||||
size_t capacity;
|
||||
} yp_string_list_t;
|
||||
|
||||
// Allocate a new yp_string_list_t.
|
||||
yp_string_list_t * yp_string_list_alloc(void);
|
||||
|
||||
// Initialize a yp_string_list_t with its default values.
|
||||
YP_EXPORTED_FUNCTION void yp_string_list_init(yp_string_list_t *string_list);
|
||||
|
||||
|
@ -1,18 +1,15 @@
|
||||
#include <ctype.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
int
|
||||
yp_strncasecmp(const char *string1, const char *string2, size_t length) {
|
||||
yp_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length) {
|
||||
size_t offset = 0;
|
||||
int difference = 0;
|
||||
|
||||
while (offset < length && string1[offset] != '\0') {
|
||||
if (string2[offset] == '\0') return string1[offset];
|
||||
|
||||
unsigned char left = (unsigned char) string1[offset];
|
||||
unsigned char right = (unsigned char) string2[offset];
|
||||
|
||||
if ((difference = tolower(left) - tolower(right)) != 0) return difference;
|
||||
if ((difference = tolower(string1[offset]) - tolower(string2[offset])) != 0) return difference;
|
||||
offset++;
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
#include "yarp/util/yp_strpbrk.h"
|
||||
|
||||
// This is the slow path that does care about the encoding.
|
||||
static inline const char *
|
||||
yp_strpbrk_multi_byte(yp_parser_t *parser, const char *source, const char *charset, size_t maximum) {
|
||||
static inline const uint8_t *
|
||||
yp_strpbrk_multi_byte(yp_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
||||
size_t index = 0;
|
||||
|
||||
while (index < maximum) {
|
||||
if (strchr(charset, source[index]) != NULL) {
|
||||
if (strchr((const char *) charset, source[index]) != NULL) {
|
||||
return source + index;
|
||||
}
|
||||
|
||||
@ -22,12 +22,12 @@ yp_strpbrk_multi_byte(yp_parser_t *parser, const char *source, const char *chars
|
||||
}
|
||||
|
||||
// This is the fast path that does not care about the encoding.
|
||||
static inline const char *
|
||||
yp_strpbrk_single_byte(const char *source, const char *charset, size_t maximum) {
|
||||
static inline const uint8_t *
|
||||
yp_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
||||
size_t index = 0;
|
||||
|
||||
while (index < maximum) {
|
||||
if (strchr(charset, source[index]) != NULL) {
|
||||
if (strchr((const char *) charset, source[index]) != NULL) {
|
||||
return source + index;
|
||||
}
|
||||
|
||||
@ -54,8 +54,8 @@ yp_strpbrk_single_byte(const char *source, const char *charset, size_t maximum)
|
||||
// characters that are trailing bytes of multi-byte characters. For example, in
|
||||
// Shift-JIS, the backslash character can be a trailing byte. In that case we
|
||||
// need to take a slower path and iterate one multi-byte character at a time.
|
||||
const char *
|
||||
yp_strpbrk(yp_parser_t *parser, const char *source, const char *charset, ptrdiff_t length) {
|
||||
const uint8_t *
|
||||
yp_strpbrk(yp_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
|
||||
if (length <= 0) {
|
||||
return NULL;
|
||||
} else if (parser->encoding_changed && parser->encoding.multibyte) {
|
||||
|
@ -24,6 +24,6 @@
|
||||
// characters that are trailing bytes of multi-byte characters. For example, in
|
||||
// Shift-JIS, the backslash character can be a trailing byte. In that case we
|
||||
// need to take a slower path and iterate one multi-byte character at a time.
|
||||
const char * yp_strpbrk(yp_parser_t *parser, const char *source, const char *charset, ptrdiff_t length);
|
||||
const uint8_t * yp_strpbrk(yp_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
|
||||
|
||||
#endif
|
||||
|
216
yarp/yarp.c
216
yarp/yarp.c
@ -167,8 +167,8 @@ debug_token(yp_token_t * token) {
|
||||
|
||||
// Returns the incrementor character that should be used to increment the
|
||||
// nesting count if one is possible.
|
||||
static inline char
|
||||
lex_mode_incrementor(const char start) {
|
||||
static inline uint8_t
|
||||
lex_mode_incrementor(const uint8_t start) {
|
||||
switch (start) {
|
||||
case '(':
|
||||
case '[':
|
||||
@ -182,8 +182,8 @@ lex_mode_incrementor(const char start) {
|
||||
|
||||
// Returns the matching character that should be used to terminate a list
|
||||
// beginning with the given character.
|
||||
static inline char
|
||||
lex_mode_terminator(const char start) {
|
||||
static inline uint8_t
|
||||
lex_mode_terminator(const uint8_t start) {
|
||||
switch (start) {
|
||||
case '(':
|
||||
return ')';
|
||||
@ -221,9 +221,9 @@ lex_mode_push(yp_parser_t *parser, yp_lex_mode_t lex_mode) {
|
||||
|
||||
// Push on a new list lex mode.
|
||||
static inline bool
|
||||
lex_mode_push_list(yp_parser_t *parser, bool interpolation, char delimiter) {
|
||||
char incrementor = lex_mode_incrementor(delimiter);
|
||||
char terminator = lex_mode_terminator(delimiter);
|
||||
lex_mode_push_list(yp_parser_t *parser, bool interpolation, uint8_t delimiter) {
|
||||
uint8_t incrementor = lex_mode_incrementor(delimiter);
|
||||
uint8_t terminator = lex_mode_terminator(delimiter);
|
||||
|
||||
yp_lex_mode_t lex_mode = {
|
||||
.mode = YP_LEX_LIST,
|
||||
@ -237,7 +237,7 @@ lex_mode_push_list(yp_parser_t *parser, bool interpolation, char delimiter) {
|
||||
|
||||
// These are the places where we need to split up the content of the list.
|
||||
// We'll use strpbrk to find the first of these characters.
|
||||
char *breakpoints = lex_mode.as.list.breakpoints;
|
||||
uint8_t *breakpoints = lex_mode.as.list.breakpoints;
|
||||
memcpy(breakpoints, "\\ \t\f\r\v\n\0\0\0", sizeof(lex_mode.as.list.breakpoints));
|
||||
|
||||
// Now we'll add the terminator to the list of breakpoints.
|
||||
@ -260,7 +260,7 @@ lex_mode_push_list(yp_parser_t *parser, bool interpolation, char delimiter) {
|
||||
|
||||
// Push on a new regexp lex mode.
|
||||
static inline bool
|
||||
lex_mode_push_regexp(yp_parser_t *parser, char incrementor, char terminator) {
|
||||
lex_mode_push_regexp(yp_parser_t *parser, uint8_t incrementor, uint8_t terminator) {
|
||||
yp_lex_mode_t lex_mode = {
|
||||
.mode = YP_LEX_REGEXP,
|
||||
.as.regexp = {
|
||||
@ -273,7 +273,7 @@ lex_mode_push_regexp(yp_parser_t *parser, char incrementor, char terminator) {
|
||||
// These are the places where we need to split up the content of the
|
||||
// regular expression. We'll use strpbrk to find the first of these
|
||||
// characters.
|
||||
char *breakpoints = lex_mode.as.regexp.breakpoints;
|
||||
uint8_t *breakpoints = lex_mode.as.regexp.breakpoints;
|
||||
memcpy(breakpoints, "\n\\#\0\0", sizeof(lex_mode.as.regexp.breakpoints));
|
||||
|
||||
// First we'll add the terminator.
|
||||
@ -289,7 +289,7 @@ lex_mode_push_regexp(yp_parser_t *parser, char incrementor, char terminator) {
|
||||
|
||||
// Push on a new string lex mode.
|
||||
static inline bool
|
||||
lex_mode_push_string(yp_parser_t *parser, bool interpolation, bool label_allowed, char incrementor, char terminator) {
|
||||
lex_mode_push_string(yp_parser_t *parser, bool interpolation, bool label_allowed, uint8_t incrementor, uint8_t terminator) {
|
||||
yp_lex_mode_t lex_mode = {
|
||||
.mode = YP_LEX_STRING,
|
||||
.as.string = {
|
||||
@ -303,7 +303,7 @@ lex_mode_push_string(yp_parser_t *parser, bool interpolation, bool label_allowed
|
||||
|
||||
// These are the places where we need to split up the content of the
|
||||
// string. We'll use strpbrk to find the first of these characters.
|
||||
char *breakpoints = lex_mode.as.string.breakpoints;
|
||||
uint8_t *breakpoints = lex_mode.as.string.breakpoints;
|
||||
memcpy(breakpoints, "\n\\\0\0\0", sizeof(lex_mode.as.string.breakpoints));
|
||||
|
||||
// Now add in the terminator.
|
||||
@ -423,7 +423,7 @@ debug_lex_state_set(yp_parser_t *parser, yp_lex_state_t state, char const * call
|
||||
|
||||
// Retrieve the constant pool id for the given location.
|
||||
static inline yp_constant_id_t
|
||||
yp_parser_constant_id_location(yp_parser_t *parser, const char *start, const char *end) {
|
||||
yp_parser_constant_id_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
||||
return yp_constant_pool_insert(&parser->constant_pool, start, (size_t) (end - start));
|
||||
}
|
||||
|
||||
@ -615,7 +615,7 @@ yp_regular_expression_flags_create(const yp_token_t *closing) {
|
||||
yp_node_flags_t flags = 0;
|
||||
|
||||
if (closing->type == YP_TOKEN_REGEXP_END) {
|
||||
for (const char *flag = closing->start + 1; flag < closing->end; flag++) {
|
||||
for (const uint8_t *flag = closing->start + 1; flag < closing->end; flag++) {
|
||||
switch (*flag) {
|
||||
case 'i': flags |= YP_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE; break;
|
||||
case 'm': flags |= YP_REGULAR_EXPRESSION_FLAGS_MULTI_LINE; break;
|
||||
@ -657,7 +657,7 @@ yp_alloc_node(YP_ATTRIBUTE_UNUSED yp_parser_t *parser, size_t size) {
|
||||
|
||||
// Allocate a new MissingNode node.
|
||||
static yp_missing_node_t *
|
||||
yp_missing_node_create(yp_parser_t *parser, const char *start, const char *end) {
|
||||
yp_missing_node_create(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
||||
yp_missing_node_t *node = YP_ALLOC_NODE(parser, yp_missing_node_t);
|
||||
*node = (yp_missing_node_t) {{ .type = YP_NODE_MISSING_NODE, .location = { .start = start, .end = end } }};
|
||||
return node;
|
||||
@ -926,7 +926,7 @@ yp_array_pattern_node_requireds_append(yp_array_pattern_node_t *node, yp_node_t
|
||||
static yp_assoc_node_t *
|
||||
yp_assoc_node_create(yp_parser_t *parser, yp_node_t *key, const yp_token_t *operator, yp_node_t *value) {
|
||||
yp_assoc_node_t *node = YP_ALLOC_NODE(parser, yp_assoc_node_t);
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
|
||||
if (value != NULL) {
|
||||
end = value->location.end;
|
||||
@ -1110,7 +1110,7 @@ static yp_block_parameters_node_t *
|
||||
yp_block_parameters_node_create(yp_parser_t *parser, yp_parameters_node_t *parameters, const yp_token_t *opening) {
|
||||
yp_block_parameters_node_t *node = YP_ALLOC_NODE(parser, yp_block_parameters_node_t);
|
||||
|
||||
const char *start;
|
||||
const uint8_t *start;
|
||||
if (opening->type != YP_TOKEN_NOT_PROVIDED) {
|
||||
start = opening->start;
|
||||
} else if (parameters != NULL) {
|
||||
@ -1119,7 +1119,7 @@ yp_block_parameters_node_create(yp_parser_t *parser, yp_parameters_node_t *param
|
||||
start = NULL;
|
||||
}
|
||||
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
if (parameters != NULL) {
|
||||
end = parameters->base.location.end;
|
||||
} else if (opening->type != YP_TOKEN_NOT_PROVIDED) {
|
||||
@ -1878,7 +1878,7 @@ yp_def_node_create(
|
||||
const yp_token_t *end_keyword
|
||||
) {
|
||||
yp_def_node_t *node = YP_ALLOC_NODE(parser, yp_def_node_t);
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
|
||||
if (end_keyword->type == YP_TOKEN_NOT_PROVIDED) {
|
||||
end = body->location.end;
|
||||
@ -1933,7 +1933,7 @@ yp_defined_node_create(yp_parser_t *parser, const yp_token_t *lparen, yp_node_t
|
||||
static yp_else_node_t *
|
||||
yp_else_node_create(yp_parser_t *parser, const yp_token_t *else_keyword, yp_statements_node_t *statements, const yp_token_t *end_keyword) {
|
||||
yp_else_node_t *node = YP_ALLOC_NODE(parser, yp_else_node_t);
|
||||
const char *end = NULL;
|
||||
const uint8_t *end = NULL;
|
||||
if ((end_keyword->type == YP_TOKEN_NOT_PROVIDED) && (statements != NULL)) {
|
||||
end = statements->base.location.end;
|
||||
} else {
|
||||
@ -2413,7 +2413,7 @@ yp_if_node_create(yp_parser_t *parser,
|
||||
yp_flip_flop(predicate);
|
||||
yp_if_node_t *node = YP_ALLOC_NODE(parser, yp_if_node_t);
|
||||
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
if (end_keyword->type != YP_TOKEN_NOT_PROVIDED) {
|
||||
end = end_keyword->end;
|
||||
} else if (consequent != NULL) {
|
||||
@ -2596,7 +2596,7 @@ static yp_in_node_t *
|
||||
yp_in_node_create(yp_parser_t *parser, yp_node_t *pattern, yp_statements_node_t *statements, const yp_token_t *in_keyword, const yp_token_t *then_keyword) {
|
||||
yp_in_node_t *node = YP_ALLOC_NODE(parser, yp_in_node_t);
|
||||
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
if (statements != NULL) {
|
||||
end = statements->base.location.end;
|
||||
} else if (then_keyword->type != YP_TOKEN_NOT_PROVIDED) {
|
||||
@ -3891,7 +3891,7 @@ yp_statements_node_body_length(yp_statements_node_t *node) {
|
||||
|
||||
// Set the location of the given StatementsNode.
|
||||
static void
|
||||
yp_statements_node_location_set(yp_statements_node_t *node, const char *start, const char *end) {
|
||||
yp_statements_node_location_set(yp_statements_node_t *node, const uint8_t *start, const uint8_t *end) {
|
||||
node->base.location = (yp_location_t) { .start = start, .end = end };
|
||||
}
|
||||
|
||||
@ -3957,7 +3957,7 @@ yp_super_node_create(yp_parser_t *parser, const yp_token_t *keyword, yp_argument
|
||||
assert(keyword->type == YP_TOKEN_KEYWORD_SUPER);
|
||||
yp_super_node_t *node = YP_ALLOC_NODE(parser, yp_super_node_t);
|
||||
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
if (arguments->block != NULL) {
|
||||
end = arguments->block->base.location.end;
|
||||
} else if (arguments->closing_loc.start != NULL) {
|
||||
@ -4048,7 +4048,7 @@ yp_symbol_node_label_create(yp_parser_t *parser, const yp_token_t *token) {
|
||||
// Check if the given node is a label in a hash.
|
||||
static bool
|
||||
yp_symbol_node_label_p(yp_node_t *node) {
|
||||
const char *end = NULL;
|
||||
const uint8_t *end = NULL;
|
||||
|
||||
switch (YP_NODE_TYPE(node)) {
|
||||
case YP_NODE_SYMBOL_NODE:
|
||||
@ -4156,7 +4156,7 @@ yp_unless_node_create(yp_parser_t *parser, const yp_token_t *keyword, yp_node_t
|
||||
yp_flip_flop(predicate);
|
||||
yp_unless_node_t *node = YP_ALLOC_NODE(parser, yp_unless_node_t);
|
||||
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
if (statements != NULL) {
|
||||
end = statements->base.location.end;
|
||||
} else {
|
||||
@ -4373,7 +4373,7 @@ static yp_yield_node_t *
|
||||
yp_yield_node_create(yp_parser_t *parser, const yp_token_t *keyword, const yp_location_t *lparen_loc, yp_arguments_node_t *arguments, const yp_location_t *rparen_loc) {
|
||||
yp_yield_node_t *node = YP_ALLOC_NODE(parser, yp_yield_node_t);
|
||||
|
||||
const char *end;
|
||||
const uint8_t *end;
|
||||
if (rparen_loc->start != NULL) {
|
||||
end = rparen_loc->end;
|
||||
} else if (arguments != NULL) {
|
||||
@ -4447,7 +4447,7 @@ yp_parser_local_depth(yp_parser_t *parser, yp_token_t *token) {
|
||||
|
||||
// Add a local variable from a location to the current scope.
|
||||
static yp_constant_id_t
|
||||
yp_parser_local_add_location(yp_parser_t *parser, const char *start, const char *end) {
|
||||
yp_parser_local_add_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
||||
yp_constant_id_t constant_id = yp_parser_constant_id_location(parser, start, end);
|
||||
|
||||
if (!yp_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
|
||||
@ -4496,15 +4496,13 @@ yp_parser_scope_pop(yp_parser_t *parser) {
|
||||
// reason we have the encoding_changed boolean to check if we need to go through
|
||||
// the function pointer or can just directly use the UTF-8 functions.
|
||||
static inline size_t
|
||||
char_is_identifier_start(yp_parser_t *parser, const char *c) {
|
||||
const unsigned char uc = (unsigned char) *c;
|
||||
|
||||
char_is_identifier_start(yp_parser_t *parser, const uint8_t *b) {
|
||||
if (parser->encoding_changed) {
|
||||
return parser->encoding.alpha_char(c, parser->end - c) || (uc == '_') || (uc >= 0x80);
|
||||
} else if (uc < 0x80) {
|
||||
return (yp_encoding_unicode_table[uc] & YP_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (uc == '_');
|
||||
return parser->encoding.alpha_char(b, parser->end - b) || (*b == '_') || (*b >= 0x80);
|
||||
} else if (*b < 0x80) {
|
||||
return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
|
||||
} else {
|
||||
return (size_t) (yp_encoding_utf_8_alpha_char(c, parser->end - c) || 1u);
|
||||
return (size_t) (yp_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4512,15 +4510,13 @@ char_is_identifier_start(yp_parser_t *parser, const char *c) {
|
||||
// the identifiers in a source file once the first character has been found. So
|
||||
// it's important that it be as fast as possible.
|
||||
static inline size_t
|
||||
char_is_identifier(yp_parser_t *parser, const char *c) {
|
||||
const unsigned char uc = (unsigned char) *c;
|
||||
|
||||
char_is_identifier(yp_parser_t *parser, const uint8_t *b) {
|
||||
if (parser->encoding_changed) {
|
||||
return parser->encoding.alnum_char(c, parser->end - c) || (uc == '_') || (uc >= 0x80);
|
||||
} else if (uc < 0x80) {
|
||||
return (yp_encoding_unicode_table[uc] & YP_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (uc == '_');
|
||||
return parser->encoding.alnum_char(b, parser->end - b) || (*b == '_') || (*b >= 0x80);
|
||||
} else if (*b < 0x80) {
|
||||
return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
|
||||
} else {
|
||||
return (size_t) (yp_encoding_utf_8_alnum_char(c, parser->end - c) || 1u);
|
||||
return (size_t) (yp_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4542,15 +4538,15 @@ const unsigned int yp_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = {
|
||||
#undef PUNCT
|
||||
|
||||
static inline bool
|
||||
char_is_global_name_punctuation(const char c) {
|
||||
const unsigned int i = (const unsigned int) c;
|
||||
char_is_global_name_punctuation(const uint8_t b) {
|
||||
const unsigned int i = (const unsigned int) b;
|
||||
if (i <= 0x20 || 0x7e < i) return false;
|
||||
|
||||
return (yp_global_name_punctuation_hash[(i - 0x20) / 32] >> (c % 32)) & 1;
|
||||
return (yp_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
token_is_numbered_parameter(const char *start, const char *end) {
|
||||
token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) {
|
||||
return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (yp_char_is_decimal_digit(start[1]));
|
||||
}
|
||||
|
||||
@ -4604,8 +4600,8 @@ yp_do_loop_stack_p(yp_parser_t *parser) {
|
||||
|
||||
// Get the next character in the source starting from +cursor+. If that position
|
||||
// is beyond the end of the source then return '\0'.
|
||||
static inline char
|
||||
peek_at(yp_parser_t *parser, const char *cursor) {
|
||||
static inline uint8_t
|
||||
peek_at(yp_parser_t *parser, const uint8_t *cursor) {
|
||||
if (cursor < parser->end) {
|
||||
return *cursor;
|
||||
} else {
|
||||
@ -4616,33 +4612,33 @@ peek_at(yp_parser_t *parser, const char *cursor) {
|
||||
// Get the next character in the source starting from parser->current.end and
|
||||
// adding the given offset. If that position is beyond the end of the source
|
||||
// then return '\0'.
|
||||
static inline char
|
||||
static inline uint8_t
|
||||
peek_offset(yp_parser_t *parser, ptrdiff_t offset) {
|
||||
return peek_at(parser, parser->current.end + offset);
|
||||
}
|
||||
|
||||
// Get the next character in the source starting from parser->current.end. If
|
||||
// that position is beyond the end of the source then return '\0'.
|
||||
static inline char
|
||||
static inline uint8_t
|
||||
peek(yp_parser_t *parser) {
|
||||
return peek_at(parser, parser->current.end);
|
||||
}
|
||||
|
||||
// Get the next string of length len in the source starting from parser->current.end.
|
||||
// If the string extends beyond the end of the source, return the empty string ""
|
||||
static inline const char*
|
||||
static inline const uint8_t *
|
||||
peek_string(yp_parser_t *parser, size_t len) {
|
||||
if (parser->current.end + len <= parser->end) {
|
||||
return parser->current.end;
|
||||
} else {
|
||||
return "";
|
||||
return (const uint8_t *) "";
|
||||
}
|
||||
}
|
||||
|
||||
// If the character to be read matches the given value, then returns true and
|
||||
// advanced the current pointer.
|
||||
static inline bool
|
||||
match(yp_parser_t *parser, char value) {
|
||||
match(yp_parser_t *parser, uint8_t value) {
|
||||
if (peek(parser) == value) {
|
||||
parser->current.end++;
|
||||
return true;
|
||||
@ -4653,7 +4649,7 @@ match(yp_parser_t *parser, char value) {
|
||||
// Return the length of the line ending string starting at +cursor+, or 0 if it
|
||||
// is not a line ending. This function is intended to be CRLF/LF agnostic.
|
||||
static inline size_t
|
||||
match_eol_at(yp_parser_t *parser, const char *cursor) {
|
||||
match_eol_at(yp_parser_t *parser, const uint8_t *cursor) {
|
||||
if (peek_at(parser, cursor) == '\n') {
|
||||
return 1;
|
||||
}
|
||||
@ -4680,8 +4676,8 @@ match_eol(yp_parser_t *parser) {
|
||||
}
|
||||
|
||||
// Skip to the next newline character or NUL byte.
|
||||
static inline const char *
|
||||
next_newline(const char *cursor, ptrdiff_t length) {
|
||||
static inline const uint8_t *
|
||||
next_newline(const uint8_t *cursor, ptrdiff_t length) {
|
||||
assert(length >= 0);
|
||||
|
||||
// Note that it's okay for us to use memchr here to look for \n because none
|
||||
@ -4692,15 +4688,15 @@ next_newline(const char *cursor, ptrdiff_t length) {
|
||||
|
||||
// Find the start of the encoding comment. This is effectively an inlined
|
||||
// version of strnstr with some modifications.
|
||||
static inline const char *
|
||||
parser_lex_encoding_comment_start(yp_parser_t *parser, const char *cursor, ptrdiff_t remaining) {
|
||||
static inline const uint8_t *
|
||||
parser_lex_encoding_comment_start(yp_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
|
||||
assert(remaining >= 0);
|
||||
size_t length = (size_t) remaining;
|
||||
|
||||
size_t key_length = strlen("coding:");
|
||||
if (key_length > length) return NULL;
|
||||
|
||||
const char *cursor_limit = cursor + length - key_length + 1;
|
||||
const uint8_t *cursor_limit = cursor + length - key_length + 1;
|
||||
while ((cursor = yp_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
|
||||
if (memcmp(cursor, "coding", key_length - 1) == 0) {
|
||||
size_t whitespace_after_coding = yp_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
|
||||
@ -4721,13 +4717,13 @@ parser_lex_encoding_comment_start(yp_parser_t *parser, const char *cursor, ptrdi
|
||||
// actions are necessary for it here.
|
||||
static void
|
||||
parser_lex_encoding_comment(yp_parser_t *parser) {
|
||||
const char *start = parser->current.start + 1;
|
||||
const char *end = next_newline(start, parser->end - start);
|
||||
const uint8_t *start = parser->current.start + 1;
|
||||
const uint8_t *end = next_newline(start, parser->end - start);
|
||||
if (end == NULL) end = parser->end;
|
||||
|
||||
// These are the patterns we're going to match to find the encoding comment.
|
||||
// This is definitely not complete or even really correct.
|
||||
const char *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
|
||||
const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
|
||||
|
||||
// If we didn't find anything that matched our patterns, then return. Note
|
||||
// that this does a _very_ poor job of actually finding the encoding, and
|
||||
@ -4740,7 +4736,7 @@ parser_lex_encoding_comment(yp_parser_t *parser) {
|
||||
|
||||
// Now determine the end of the encoding string. This is either the end of
|
||||
// the line, the first whitespace character, or a punctuation mark.
|
||||
const char *encoding_end = yp_strpbrk(parser, encoding_start, " \t\f\r\v\n;,", end - encoding_start);
|
||||
const uint8_t *encoding_end = yp_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
|
||||
encoding_end = encoding_end == NULL ? end : encoding_end;
|
||||
|
||||
// Finally, we can determine the width of the encoding string.
|
||||
@ -4762,7 +4758,7 @@ parser_lex_encoding_comment(yp_parser_t *parser) {
|
||||
// Extensions like utf-8 can contain extra encoding details like,
|
||||
// utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
|
||||
// treat any encoding starting utf-8 as utf-8.
|
||||
if ((encoding_start + 5 <= parser->end) && (yp_strncasecmp(encoding_start, "utf-8", 5) == 0)) {
|
||||
if ((encoding_start + 5 <= parser->end) && (yp_strncasecmp(encoding_start, (const uint8_t *) "utf-8", 5) == 0)) {
|
||||
// We don't need to do anything here because the default encoding is
|
||||
// already UTF-8. We'll just return.
|
||||
return;
|
||||
@ -4771,7 +4767,7 @@ parser_lex_encoding_comment(yp_parser_t *parser) {
|
||||
// Next, we're going to loop through each of the encodings that we handle
|
||||
// explicitly. If we found one that we understand, we'll use that value.
|
||||
#define ENCODING(value, prebuilt) \
|
||||
if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && yp_strncasecmp(encoding_start, value, width) == 0) { \
|
||||
if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && yp_strncasecmp(encoding_start, (const uint8_t *) value, width) == 0) { \
|
||||
parser->encoding = prebuilt; \
|
||||
parser->encoding_changed |= true; \
|
||||
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
|
||||
@ -5093,7 +5089,7 @@ lex_numeric(yp_parser_t *parser) {
|
||||
if (parser->current.end < parser->end) {
|
||||
type = lex_numeric_prefix(parser);
|
||||
|
||||
const char *end = parser->current.end;
|
||||
const uint8_t *end = parser->current.end;
|
||||
yp_token_type_t suffix_type = type;
|
||||
|
||||
if (type == YP_TOKEN_INTEGER) {
|
||||
@ -5118,8 +5114,8 @@ lex_numeric(yp_parser_t *parser) {
|
||||
}
|
||||
}
|
||||
|
||||
const unsigned char uc = (const unsigned char) peek(parser);
|
||||
if (uc != '\0' && (uc >= 0x80 || ((uc >= 'a' && uc <= 'z') || (uc >= 'A' && uc <= 'Z')) || uc == '_')) {
|
||||
const uint8_t b = peek(parser);
|
||||
if (b != '\0' && (b >= 0x80 || ((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')) || b == '_')) {
|
||||
parser->current.end = end;
|
||||
} else {
|
||||
type = suffix_type;
|
||||
@ -5390,7 +5386,7 @@ current_token_starts_line(yp_parser_t *parser) {
|
||||
// this token type.
|
||||
//
|
||||
static yp_token_type_t
|
||||
lex_interpolation(yp_parser_t *parser, const char *pound) {
|
||||
lex_interpolation(yp_parser_t *parser, const uint8_t *pound) {
|
||||
// If there is no content following this #, then we're at the end of
|
||||
// the string and we can safely return string content.
|
||||
if (pound + 1 >= parser->end) {
|
||||
@ -5411,7 +5407,7 @@ lex_interpolation(yp_parser_t *parser, const char *pound) {
|
||||
|
||||
// If we're looking at a @ and there's another @, then we'll skip past the
|
||||
// second @.
|
||||
const char *variable = pound + 2;
|
||||
const uint8_t *variable = pound + 2;
|
||||
if (*variable == '@' && pound + 3 < parser->end) variable++;
|
||||
|
||||
if (char_is_identifier_start(parser, variable)) {
|
||||
@ -5447,7 +5443,7 @@ lex_interpolation(yp_parser_t *parser, const char *pound) {
|
||||
// This is the character that we're going to check to see if it is the
|
||||
// start of an identifier that would indicate that this is a global
|
||||
// variable.
|
||||
const char *check = pound + 2;
|
||||
const uint8_t *check = pound + 2;
|
||||
|
||||
if (pound[2] == '-') {
|
||||
if (pound + 3 >= parser->end) {
|
||||
@ -5638,7 +5634,7 @@ parser_comment(yp_parser_t *parser, yp_comment_type_t type) {
|
||||
static yp_token_type_t
|
||||
lex_embdoc(yp_parser_t *parser) {
|
||||
// First, lex out the EMBDOC_BEGIN token.
|
||||
const char *newline = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
|
||||
if (newline == NULL) {
|
||||
parser->current.end = parser->end;
|
||||
@ -5663,7 +5659,7 @@ lex_embdoc(yp_parser_t *parser) {
|
||||
// token here.
|
||||
if (memcmp(parser->current.end, "=end", 4) == 0 &&
|
||||
(parser->current.end + 4 == parser->end || yp_char_is_whitespace(parser->current.end[4]))) {
|
||||
const char *newline = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
|
||||
if (newline == NULL) {
|
||||
parser->current.end = parser->end;
|
||||
@ -5683,7 +5679,7 @@ lex_embdoc(yp_parser_t *parser) {
|
||||
|
||||
// Otherwise, we'll parse until the end of the line and return a line of
|
||||
// embedded documentation.
|
||||
const char *newline = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
|
||||
if (newline == NULL) {
|
||||
parser->current.end = parser->end;
|
||||
@ -5833,7 +5829,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
LEX(YP_TOKEN_EOF);
|
||||
|
||||
case '#': { // comments
|
||||
const char *ending = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
const uint8_t *ending = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
|
||||
parser->current.end = ending == NULL ? parser->end : ending + 1;
|
||||
parser->current.type = YP_TOKEN_COMMENT;
|
||||
@ -5902,7 +5898,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
// (either . or &.) that starts the next line. If there is, then this
|
||||
// is going to become an ignored newline and we're going to instead
|
||||
// return the call operator.
|
||||
const char *next_content = parser->next_start == NULL ? parser->current.end : parser->next_start;
|
||||
const uint8_t *next_content = parser->next_start == NULL ? parser->current.end : parser->next_start;
|
||||
next_content += yp_strspn_inline_whitespace(next_content, parser->end - next_content);
|
||||
|
||||
if (next_content < parser->end) {
|
||||
@ -5913,7 +5909,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
// Otherwise we'll return a regular newline.
|
||||
if (next_content[0] == '#') {
|
||||
// Here we look for a "." or "&." following a "\n".
|
||||
const char *following = next_newline(next_content, parser->end - next_content);
|
||||
const uint8_t *following = next_newline(next_content, parser->end - next_content);
|
||||
|
||||
while (following && (following + 1 < parser->end)) {
|
||||
following++;
|
||||
@ -6202,7 +6198,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
!lex_state_end_p(parser) &&
|
||||
(!lex_state_p(parser, YP_LEX_STATE_ARG_ANY) || lex_state_p(parser, YP_LEX_STATE_LABELED) || space_seen)
|
||||
) {
|
||||
const char *end = parser->current.end;
|
||||
const uint8_t *end = parser->current.end;
|
||||
|
||||
yp_heredoc_quote_t quote = YP_HEREDOC_QUOTE_NONE;
|
||||
yp_heredoc_indent_t indent = YP_HEREDOC_INDENT_NONE;
|
||||
@ -6224,7 +6220,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
quote = YP_HEREDOC_QUOTE_SINGLE;
|
||||
}
|
||||
|
||||
const char *ident_start = parser->current.end;
|
||||
const uint8_t *ident_start = parser->current.end;
|
||||
size_t width = 0;
|
||||
|
||||
if (parser->current.end >= parser->end) {
|
||||
@ -6247,7 +6243,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
}
|
||||
|
||||
size_t ident_length = (size_t) (parser->current.end - ident_start);
|
||||
if (quote != YP_HEREDOC_QUOTE_NONE && !match(parser, (char) quote)) {
|
||||
if (quote != YP_HEREDOC_QUOTE_NONE && !match(parser, (uint8_t) quote)) {
|
||||
// TODO: handle unterminated heredoc
|
||||
}
|
||||
|
||||
@ -6263,7 +6259,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
});
|
||||
|
||||
if (parser->heredoc_end == NULL) {
|
||||
const char *body_start = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
const uint8_t *body_start = next_newline(parser->current.end, parser->end - parser->current.end);
|
||||
|
||||
if (body_start == NULL) {
|
||||
// If there is no newline after the heredoc identifier, then
|
||||
@ -6905,8 +6901,8 @@ parser_lex(yp_parser_t *parser) {
|
||||
// Here we'll get a list of the places where strpbrk should break,
|
||||
// and then find the first one.
|
||||
yp_lex_mode_t *lex_mode = parser->lex_modes.current;
|
||||
const char *breakpoints = lex_mode->as.list.breakpoints;
|
||||
const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
|
||||
const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
|
||||
while (breakpoint != NULL) {
|
||||
// If we hit a null byte, skip directly past it.
|
||||
@ -7028,8 +7024,8 @@ parser_lex(yp_parser_t *parser) {
|
||||
// These are the places where we need to split up the content of the
|
||||
// regular expression. We'll use strpbrk to find the first of these
|
||||
// characters.
|
||||
const char *breakpoints = lex_mode->as.regexp.breakpoints;
|
||||
const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
|
||||
const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
|
||||
while (breakpoint != NULL) {
|
||||
// If we hit a null byte, skip directly past it.
|
||||
@ -7162,8 +7158,8 @@ parser_lex(yp_parser_t *parser) {
|
||||
|
||||
// These are the places where we need to split up the content of the
|
||||
// string. We'll use strpbrk to find the first of these characters.
|
||||
const char *breakpoints = parser->lex_modes.current->as.string.breakpoints;
|
||||
const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
const uint8_t *breakpoints = parser->lex_modes.current->as.string.breakpoints;
|
||||
const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
|
||||
while (breakpoint != NULL) {
|
||||
// If we hit the incrementor, then we'll increment then nesting and
|
||||
@ -7314,13 +7310,13 @@ parser_lex(yp_parser_t *parser) {
|
||||
|
||||
// Now let's grab the information about the identifier off of the current
|
||||
// lex mode.
|
||||
const char *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
|
||||
const uint8_t *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
|
||||
size_t ident_length = parser->lex_modes.current->as.heredoc.ident_length;
|
||||
|
||||
// If we are immediately following a newline and we have hit the
|
||||
// terminator, then we need to return the ending of the heredoc.
|
||||
if (current_token_starts_line(parser)) {
|
||||
const char *start = parser->current.start;
|
||||
const uint8_t *start = parser->current.start;
|
||||
if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) {
|
||||
start += yp_strspn_inline_whitespace(start, parser->end - start);
|
||||
}
|
||||
@ -7360,14 +7356,14 @@ parser_lex(yp_parser_t *parser) {
|
||||
// Otherwise we'll be parsing string content. These are the places where
|
||||
// we need to split up the content of the heredoc. We'll use strpbrk to
|
||||
// find the first of these characters.
|
||||
char breakpoints[] = "\n\\#";
|
||||
uint8_t breakpoints[] = "\n\\#";
|
||||
|
||||
yp_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
|
||||
if (quote == YP_HEREDOC_QUOTE_SINGLE) {
|
||||
breakpoints[2] = '\0';
|
||||
}
|
||||
|
||||
const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
|
||||
|
||||
while (breakpoint != NULL) {
|
||||
switch (*breakpoint) {
|
||||
@ -7384,7 +7380,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
|
||||
yp_newline_list_append(&parser->newline_list, breakpoint);
|
||||
|
||||
const char *start = breakpoint + 1;
|
||||
const uint8_t *start = breakpoint + 1;
|
||||
if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) {
|
||||
start += yp_strspn_inline_whitespace(start, parser->end - start);
|
||||
}
|
||||
@ -7966,10 +7962,11 @@ parse_target(yp_parser_t *parser, yp_node_t *target) {
|
||||
// the previous method name in, and append an =.
|
||||
size_t length = yp_string_length(&call->name);
|
||||
|
||||
char *name = calloc(length + 2, sizeof(char));
|
||||
uint8_t *name = calloc(length + 1, sizeof(uint8_t));
|
||||
if (name == NULL) return NULL;
|
||||
|
||||
snprintf(name, length + 2, "%.*s=", (int) length, yp_string_source(&call->name));
|
||||
memcpy(name, yp_string_source(&call->name), length);
|
||||
name[length] = '=';
|
||||
|
||||
// Now switch the name to the new string.
|
||||
yp_string_free(&call->name);
|
||||
@ -8123,10 +8120,11 @@ parse_write(yp_parser_t *parser, yp_node_t *target, yp_token_t *operator, yp_nod
|
||||
// the previous method name in, and append an =.
|
||||
size_t length = yp_string_length(&call->name);
|
||||
|
||||
char *name = calloc(length + 2, sizeof(char));
|
||||
uint8_t *name = calloc(length + 1, sizeof(uint8_t));
|
||||
if (name == NULL) return NULL;
|
||||
|
||||
snprintf(name, length + 2, "%.*s=", (int) length, yp_string_source(&call->name));
|
||||
memcpy(name, yp_string_source(&call->name), length);
|
||||
name[length] = '=';
|
||||
|
||||
// Now switch the name to the new string.
|
||||
yp_string_free(&call->name);
|
||||
@ -9113,7 +9111,7 @@ parse_rescues(yp_parser_t *parser, yp_begin_node_t *parent_node) {
|
||||
// since we won't know the end until we've found all consequent
|
||||
// clauses. This sets the end location on all rescues once we know it
|
||||
if (current) {
|
||||
const char *end_to_set = current->base.location.end;
|
||||
const uint8_t *end_to_set = current->base.location.end;
|
||||
current = parent_node->rescue_clause;
|
||||
while (current) {
|
||||
current->base.location.end = end_to_set;
|
||||
@ -9170,7 +9168,7 @@ parse_rescues_as_begin(yp_parser_t *parser, yp_statements_node_t *statements) {
|
||||
// All nodes within a begin node are optional, so we look
|
||||
// for the earliest possible node that we can use to set
|
||||
// the BeginNode's start location
|
||||
const char * start = begin_node->base.location.start;
|
||||
const uint8_t *start = begin_node->base.location.start;
|
||||
if (begin_node->statements) {
|
||||
start = begin_node->statements->base.location.start;
|
||||
} else if (begin_node->rescue_clause) {
|
||||
@ -9845,7 +9843,7 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
|
||||
// variable.
|
||||
if (index == 0 || YP_NODE_TYPE_P(nodes->nodes[index - 1], YP_NODE_STRING_NODE)) {
|
||||
int cur_whitespace;
|
||||
const char *cur_char = content_loc->start;
|
||||
const uint8_t *cur_char = content_loc->start;
|
||||
|
||||
while (cur_char && cur_char < content_loc->end) {
|
||||
// Any empty newlines aren't included in the minimum whitespace
|
||||
@ -9936,15 +9934,15 @@ parse_heredoc_dedent(yp_parser_t *parser, yp_node_t *node, yp_heredoc_quote_t qu
|
||||
// destination to move bytes into. We'll also use it for bounds checking
|
||||
// since we don't require that these strings be null terminated.
|
||||
size_t dest_length = yp_string_length(string);
|
||||
char *source_start = string->source;
|
||||
uint8_t *source_start = (uint8_t *) string->source;
|
||||
|
||||
const char *source_cursor = source_start;
|
||||
const char *source_end = source_cursor + dest_length;
|
||||
const uint8_t *source_cursor = source_start;
|
||||
const uint8_t *source_end = source_cursor + dest_length;
|
||||
|
||||
// We're going to move bytes backward in the string when we get leading
|
||||
// whitespace, so we'll maintain a pointer to the current position in the
|
||||
// string that we're writing to.
|
||||
char *dest_cursor = source_start;
|
||||
uint8_t *dest_cursor = source_start;
|
||||
|
||||
while (source_cursor < source_end) {
|
||||
// If we need to dedent the next element within the heredoc or the next
|
||||
@ -9971,7 +9969,7 @@ parse_heredoc_dedent(yp_parser_t *parser, yp_node_t *node, yp_heredoc_quote_t qu
|
||||
|
||||
// At this point we have dedented all that we need to, so we need to find
|
||||
// the next newline.
|
||||
const char *breakpoint = next_newline(source_cursor, source_end - source_cursor);
|
||||
const uint8_t *breakpoint = next_newline(source_cursor, source_end - source_cursor);
|
||||
|
||||
if (breakpoint == NULL) {
|
||||
// If there isn't another newline, then we can just move the rest of the
|
||||
@ -13587,7 +13585,7 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
|
||||
uint32_t local_size = yp_metadata_read_u32(metadata);
|
||||
metadata += 4;
|
||||
|
||||
yp_parser_local_add_location(parser, metadata, metadata + local_size);
|
||||
yp_parser_local_add_location(parser, (const uint8_t *) metadata, (const uint8_t *) (metadata + local_size));
|
||||
metadata += local_size;
|
||||
}
|
||||
}
|
||||
@ -13599,7 +13597,7 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
|
||||
|
||||
// Initialize a parser with the given start and end pointers.
|
||||
YP_EXPORTED_FUNCTION void
|
||||
yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
|
||||
yp_parser_init(yp_parser_t *parser, const uint8_t *source, size_t size, const char *filepath) {
|
||||
assert(source != NULL);
|
||||
|
||||
// Set filepath to the file that was passed
|
||||
@ -13671,7 +13669,7 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
|
||||
yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
|
||||
|
||||
// Skip past the UTF-8 BOM if it exists.
|
||||
if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
|
||||
if (size >= 3 && source[0] == 0xef && source[1] == 0xbb && source[2] == 0xbf) {
|
||||
parser->current.end += 3;
|
||||
parser->encoding_comment_start += 3;
|
||||
}
|
||||
@ -13679,7 +13677,7 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
|
||||
// If the first two bytes of the source are a shebang, then we'll indicate
|
||||
// that the encoding comment is at the end of the shebang.
|
||||
if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
|
||||
const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
|
||||
const uint8_t *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
|
||||
if (encoding_comment_start) {
|
||||
parser->encoding_comment_start = encoding_comment_start + 1;
|
||||
}
|
||||
@ -13751,7 +13749,7 @@ yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) {
|
||||
// Parse and serialize the AST represented by the given source to the given
|
||||
// buffer.
|
||||
YP_EXPORTED_FUNCTION void
|
||||
yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
|
||||
yp_parse_serialize(const uint8_t *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, source, size, NULL);
|
||||
if (metadata) yp_parser_metadata(&parser, metadata);
|
||||
|
@ -40,7 +40,7 @@ void yp_scope_node_init(yp_node_t *node, yp_scope_node_t *dest);
|
||||
YP_EXPORTED_FUNCTION const char * yp_version(void);
|
||||
|
||||
// Initialize a parser with the given start and end pointers.
|
||||
YP_EXPORTED_FUNCTION void yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath);
|
||||
YP_EXPORTED_FUNCTION void yp_parser_init(yp_parser_t *parser, const uint8_t *source, size_t size, const char *filepath);
|
||||
|
||||
// Register a callback that will be called whenever YARP changes the encoding it
|
||||
// is using to parse based on the magic comment.
|
||||
@ -66,14 +66,14 @@ YP_EXPORTED_FUNCTION void yp_prettyprint(yp_parser_t *parser, yp_node_t *node, y
|
||||
YP_EXPORTED_FUNCTION void yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer);
|
||||
|
||||
// Parse the given source to the AST and serialize the AST to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata);
|
||||
YP_EXPORTED_FUNCTION void yp_parse_serialize(const uint8_t *source, size_t size, yp_buffer_t *buffer, const char *metadata);
|
||||
|
||||
// Lex the given source and serialize to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void yp_lex_serialize(const char *source, size_t size, const char *filepath, yp_buffer_t *buffer);
|
||||
YP_EXPORTED_FUNCTION void yp_lex_serialize(const uint8_t *source, size_t size, const char *filepath, yp_buffer_t *buffer);
|
||||
|
||||
// Parse and serialize both the AST and the tokens represented by the given
|
||||
// source to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void yp_parse_lex_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata);
|
||||
YP_EXPORTED_FUNCTION void yp_parse_lex_serialize(const uint8_t *source, size_t size, yp_buffer_t *buffer, const char *metadata);
|
||||
|
||||
// Returns a string representation of the given token type.
|
||||
YP_EXPORTED_FUNCTION const char * yp_token_type_to_str(yp_token_type_t token_type);
|
||||
|
Loading…
x
Reference in New Issue
Block a user