* include/ruby/encoding.h (rb_econv_output): declared.
* transcode_data.h (rb_transcoder): add resetsize_func field. * enc/trans/iso2022.trans (iso2022jp_reset_sequence_size): defined. (rb_EUC_JP_to_ISO_2022_JP): provide resetsize_func. * tool/transcode-tblgen.rb: set NULL for resetsize_func. * transcode.c (rb_econv_output): new function for inserting output. (output_replacement_character): use rb_econv_output. (transcode_loop): check return value of output_replacement_character. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18628 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
e0e39e0db8
commit
74a36d5d1f
16
ChangeLog
16
ChangeLog
@ -1,3 +1,19 @@
|
|||||||
|
Fri Aug 15 00:52:40 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
|
* include/ruby/encoding.h (rb_econv_output): declared.
|
||||||
|
|
||||||
|
* transcode_data.h (rb_transcoder): add resetsize_func field.
|
||||||
|
|
||||||
|
* enc/trans/iso2022.trans (iso2022jp_reset_sequence_size): defined.
|
||||||
|
(rb_EUC_JP_to_ISO_2022_JP): provide resetsize_func.
|
||||||
|
|
||||||
|
* tool/transcode-tblgen.rb: set NULL for resetsize_func.
|
||||||
|
|
||||||
|
* transcode.c (rb_econv_output): new function for inserting output.
|
||||||
|
(output_replacement_character): use rb_econv_output.
|
||||||
|
(transcode_loop): check return value of
|
||||||
|
output_replacement_character.
|
||||||
|
|
||||||
Thu Aug 14 23:47:21 2008 Tanaka Akira <akr@fsij.org>
|
Thu Aug 14 23:47:21 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
* include/ruby/encoding.h (ECONV_UNIVERSAL_NEWLINE_DECODER): defined.
|
* include/ruby/encoding.h (ECONV_UNIVERSAL_NEWLINE_DECODER): defined.
|
||||||
|
@ -83,7 +83,7 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u
|
|||||||
|
|
||||||
if (t->stateful[0] == 0) {
|
if (t->stateful[0] == 0) {
|
||||||
t->stateful[0] = 1; /* initialized flag */
|
t->stateful[0] = 1; /* initialized flag */
|
||||||
t->stateful[1] = 1; /* ASCII mode */
|
t->stateful[1] = 1; /* G0 = ASCII */
|
||||||
}
|
}
|
||||||
|
|
||||||
if (l != t->stateful[1]) {
|
if (l != t->stateful[1]) {
|
||||||
@ -91,13 +91,13 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u
|
|||||||
*o++ = 0x1b;
|
*o++ = 0x1b;
|
||||||
*o++ = '(';
|
*o++ = '(';
|
||||||
*o++ = 'B';
|
*o++ = 'B';
|
||||||
t->stateful[1] = 1;
|
t->stateful[1] = 1; /* G0 = ASCII */
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
*o++ = 0x1b;
|
*o++ = 0x1b;
|
||||||
*o++ = '$';
|
*o++ = '$';
|
||||||
*o++ = 'B';
|
*o++ = 'B';
|
||||||
t->stateful[1] = 2;
|
t->stateful[1] = 2; /* G0 = JIS X 0208 1983 */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,6 +112,14 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u
|
|||||||
return o - output0;
|
return o - output0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
iso2022jp_reset_sequence_size(rb_transcoding *t)
|
||||||
|
{
|
||||||
|
if (t->stateful[1] == 2)
|
||||||
|
return 3;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
|
finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
|
||||||
{
|
{
|
||||||
@ -137,7 +145,8 @@ rb_EUC_JP_to_ISO_2022_JP = {
|
|||||||
3, /* max_input */
|
3, /* max_input */
|
||||||
5, /* max_output */
|
5, /* max_output */
|
||||||
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
|
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
|
||||||
finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
|
finish_eucjp_to_iso2022jp,
|
||||||
|
iso2022jp_reset_sequence_size, finish_eucjp_to_iso2022jp
|
||||||
};
|
};
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -230,11 +230,15 @@ typedef struct {
|
|||||||
rb_encoding *destination_encoding;
|
rb_encoding *destination_encoding;
|
||||||
} rb_econv_t;
|
} rb_econv_t;
|
||||||
|
|
||||||
rb_econv_t *rb_econv_open(const char *from, const char *to, int flags);
|
rb_econv_t *rb_econv_open(const char *source_encoding, const char *destination_encoding, int flags);
|
||||||
rb_econv_result_t rb_econv_convert(rb_econv_t *ec,
|
rb_econv_result_t rb_econv_convert(rb_econv_t *ec,
|
||||||
const unsigned char **input_ptr, const unsigned char *input_stop,
|
const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end,
|
||||||
unsigned char **output_ptr, unsigned char *output_stop,
|
unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end,
|
||||||
int flags);
|
int flags);
|
||||||
|
int rb_econv_output(rb_econv_t *ec,
|
||||||
|
const unsigned char *str, size_t len,
|
||||||
|
unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end,
|
||||||
|
size_t *required_size);
|
||||||
void rb_econv_close(rb_econv_t *ec);
|
void rb_econv_close(rb_econv_t *ec);
|
||||||
|
|
||||||
/* flags for rb_econv_open */
|
/* flags for rb_econv_open */
|
||||||
|
@ -449,7 +449,8 @@ static const rb_transcoder
|
|||||||
#{input_unit_length}, /* input_unit_length */
|
#{input_unit_length}, /* input_unit_length */
|
||||||
#{max_input}, /* max_input */
|
#{max_input}, /* max_input */
|
||||||
#{max_output}, /* max_output */
|
#{max_output}, /* max_output */
|
||||||
NULL, NULL, NULL, NULL, NULL, NULL
|
NULL, NULL, NULL, NULL,
|
||||||
|
NULL, NULL, NULL
|
||||||
};
|
};
|
||||||
End
|
End
|
||||||
tree_code + "\n" + transcoder_code
|
tree_code + "\n" + transcoder_code
|
||||||
|
106
transcode.c
106
transcode.c
@ -937,6 +937,58 @@ rb_econv_convert(rb_econv_t *ec,
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
rb_econv_output(rb_econv_t *ec,
|
||||||
|
const unsigned char *str, size_t len, /* string in destination encoding */
|
||||||
|
unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end,
|
||||||
|
size_t *required_size)
|
||||||
|
{
|
||||||
|
size_t reset_len, total_len;
|
||||||
|
rb_transcoding *tc = ec->last_tc;
|
||||||
|
const rb_transcoder *tr = tc->transcoder;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assumption for stateful encoding:
|
||||||
|
*
|
||||||
|
* - str can be output in the reset state and doesn't change the state.
|
||||||
|
* - an extra state changing sequence is acceptable if str contains
|
||||||
|
* a state changing sequence.
|
||||||
|
*
|
||||||
|
* Currently the replacement character for stateful encoding such as
|
||||||
|
* ISO-2022-JP is "?" and it has no state changing sequence.
|
||||||
|
* So the extra state changing sequence doesn't occur when
|
||||||
|
* rb_econv_output is used for replacement characters.
|
||||||
|
*
|
||||||
|
* These assumptions may be removed in the future.
|
||||||
|
* It needs to scan str to check state changing sequences in it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
reset_len = 0;
|
||||||
|
if (tr->resetsize_func) {
|
||||||
|
reset_len = tr->resetsize_func(tc);
|
||||||
|
}
|
||||||
|
|
||||||
|
total_len = reset_len + len;
|
||||||
|
if (total_len < len)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (required_size) {
|
||||||
|
*required_size = total_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (destination_buffer_end - *destination_buffer_ptr < total_len)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (reset_len) {
|
||||||
|
*destination_buffer_ptr += tr->resetstate_func(tc, *destination_buffer_ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(*destination_buffer_ptr, str, len);
|
||||||
|
*destination_buffer_ptr += len;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
rb_econv_close(rb_econv_t *ec)
|
rb_econv_close(rb_econv_t *ec)
|
||||||
{
|
{
|
||||||
@ -968,58 +1020,40 @@ more_output_buffer(
|
|||||||
*out_stop_ptr = *out_start_ptr + new_len;
|
*out_stop_ptr = *out_start_ptr + new_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static int
|
||||||
output_replacement_character(
|
output_replacement_character(
|
||||||
VALUE destination,
|
VALUE destination,
|
||||||
unsigned char *(*resize_destination)(VALUE, int, int),
|
unsigned char *(*resize_destination)(VALUE, int, int),
|
||||||
rb_transcoding *tc,
|
rb_econv_t *ec,
|
||||||
unsigned char **out_start_ptr,
|
unsigned char **out_start_ptr,
|
||||||
unsigned char **out_pos,
|
unsigned char **out_pos,
|
||||||
unsigned char **out_stop_ptr)
|
unsigned char **out_stop_ptr)
|
||||||
|
|
||||||
{
|
{
|
||||||
|
rb_transcoding *tc = ec->last_tc;
|
||||||
const rb_transcoder *tr;
|
const rb_transcoder *tr;
|
||||||
int max_output;
|
|
||||||
rb_encoding *enc;
|
rb_encoding *enc;
|
||||||
const char *replacement;
|
const unsigned char *replacement;
|
||||||
int len;
|
int len;
|
||||||
|
size_t required_size;
|
||||||
|
|
||||||
tr = tc->transcoder;
|
tr = tc->transcoder;
|
||||||
max_output = tr->max_output;
|
|
||||||
enc = rb_enc_find(tr->to_encoding);
|
enc = rb_enc_find(tr->to_encoding);
|
||||||
|
|
||||||
/*
|
replacement = (const unsigned char *)get_replacement_character(enc, &len);
|
||||||
* Assumption for stateful encoding:
|
|
||||||
*
|
|
||||||
* - The replacement character can be output on resetted state and doesn't
|
|
||||||
* change the state.
|
|
||||||
* - it is acceptable that extra state changing sequence if the replacement
|
|
||||||
* character contains a state changing sequence.
|
|
||||||
*
|
|
||||||
* Currently the replacement character for stateful encoding such as
|
|
||||||
* ISO-2022-JP is "?" and it has no state changing sequence.
|
|
||||||
* So the extra state changing sequence don't occur.
|
|
||||||
*
|
|
||||||
* Thease assumption may be removed in future.
|
|
||||||
* It needs to scan the replacement character to check
|
|
||||||
* state changing sequences in the replacement character.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (tr->resetstate_func) {
|
if (rb_econv_output(ec, replacement, len, out_pos, *out_stop_ptr, &required_size) == 0)
|
||||||
if (*out_stop_ptr - *out_pos < max_output)
|
return 0;
|
||||||
more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
|
|
||||||
*out_pos += tr->resetstate_func(tc, *out_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*out_stop_ptr - *out_pos < max_output)
|
if (required_size < len)
|
||||||
more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
|
return -1; /* overflow */
|
||||||
|
|
||||||
replacement = get_replacement_character(enc, &len);
|
more_output_buffer(destination, resize_destination, required_size, out_start_ptr, out_pos, out_stop_ptr);
|
||||||
|
|
||||||
memcpy(*out_pos, replacement, len);
|
if (rb_econv_output(ec, replacement, len, out_pos, *out_stop_ptr, &required_size) == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
*out_pos += len;
|
return -1;
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
@ -1054,7 +1088,7 @@ resume:
|
|||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
else if (opt&INVALID_REPLACE) {
|
else if (opt&INVALID_REPLACE) {
|
||||||
output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
|
if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
|
||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
rb_econv_close(ec);
|
rb_econv_close(ec);
|
||||||
@ -1068,7 +1102,7 @@ resume:
|
|||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
else if (opt&UNDEF_REPLACE) {
|
else if (opt&UNDEF_REPLACE) {
|
||||||
output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
|
if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
|
||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
rb_econv_close(ec);
|
rb_econv_close(ec);
|
||||||
@ -1135,7 +1169,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if (opt&INVALID_REPLACE) {
|
else if (opt&INVALID_REPLACE) {
|
||||||
output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
|
if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
rb_econv_close(ec);
|
rb_econv_close(ec);
|
||||||
@ -1150,7 +1184,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if (opt&UNDEF_REPLACE) {
|
else if (opt&UNDEF_REPLACE) {
|
||||||
output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
|
if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
rb_econv_close(ec);
|
rb_econv_close(ec);
|
||||||
|
@ -107,8 +107,9 @@ struct rb_transcoder {
|
|||||||
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
|
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
|
||||||
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */
|
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */
|
||||||
int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
|
int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
|
||||||
int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
|
|
||||||
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
|
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
|
||||||
|
int (*resetsize_func)(rb_transcoding*); /* -> len */
|
||||||
|
int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
|
||||||
};
|
};
|
||||||
|
|
||||||
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);
|
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user