* tool/transcode-tblgen.rb: distinguish UNDEF and INVALID.
  [ruby-dev:35709]

* transcode.c (transcode_loop): don't need rb_enc_mbclen now.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
fc841ddc66
commit
5adb247914
@ -1,3 +1,10 @@
|
|||||||
|
Wed Aug 6 20:44:41 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
|
* tool/transcode-tblgen.rb: distinguish UNDEF and INVALID.
|
||||||
|
[ruby-dev:35709]
|
||||||
|
|
||||||
|
* transcode.c (transcode_loop): don't need rb_enc_mbclen now.
|
||||||
|
|
||||||
Wed Aug 6 14:40:11 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
Wed Aug 6 14:40:11 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||||
|
|
||||||
* common.mk (transdb.h): requires transcoders.
|
* common.mk (transdb.h): requires transcoders.
|
||||||
|
@ -17,6 +17,9 @@ end
|
|||||||
|
|
||||||
class StrSet
|
class StrSet
|
||||||
def self.parse(pattern)
|
def self.parse(pattern)
|
||||||
|
if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
|
||||||
|
raise ArgumentError, "invalid pattern: #{pattern.inspect}"
|
||||||
|
end
|
||||||
result = []
|
result = []
|
||||||
pattern.scan(/\S+/) {|seq|
|
pattern.scan(/\S+/) {|seq|
|
||||||
seq_result = []
|
seq_result = []
|
||||||
@ -69,25 +72,27 @@ class StrSet
|
|||||||
def to_s
|
def to_s
|
||||||
if @pat.empty?
|
if @pat.empty?
|
||||||
"(empset)"
|
"(empset)"
|
||||||
elsif @pat == [[]]
|
|
||||||
"(empstr)"
|
|
||||||
else
|
else
|
||||||
@pat.map {|seq|
|
@pat.map {|seq|
|
||||||
seq.map {|byteset|
|
if seq.empty?
|
||||||
if byteset.length == 1 && byteset[0].begin == byteset[0].end
|
"(empstr)"
|
||||||
"%02x" % byteset[0].begin
|
else
|
||||||
else
|
seq.map {|byteset|
|
||||||
"{" +
|
if byteset.length == 1 && byteset[0].begin == byteset[0].end
|
||||||
byteset.map {|range|
|
"%02x" % byteset[0].begin
|
||||||
if range.begin == range.end
|
else
|
||||||
"%02x" % range.begin
|
"{" +
|
||||||
else
|
byteset.map {|range|
|
||||||
"%02x-%02x" % [range.begin, range.end]
|
if range.begin == range.end
|
||||||
end
|
"%02x" % range.begin
|
||||||
}.join(',') +
|
else
|
||||||
"}"
|
"%02x-%02x" % [range.begin, range.end]
|
||||||
end
|
end
|
||||||
}.join('')
|
}.join(',') +
|
||||||
|
"}"
|
||||||
|
end
|
||||||
|
}.join('')
|
||||||
|
end
|
||||||
}.join(' ')
|
}.join(' ')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@ -142,9 +147,7 @@ class ActionMap
|
|||||||
|
|
||||||
def initialize(h)
|
def initialize(h)
|
||||||
@map = h
|
@map = h
|
||||||
@default_action = :undef
|
|
||||||
end
|
end
|
||||||
attr_accessor :default_action
|
|
||||||
|
|
||||||
def hash
|
def hash
|
||||||
hash = 0
|
hash = 0
|
||||||
@ -174,7 +177,7 @@ class ActionMap
|
|||||||
nil
|
nil
|
||||||
end
|
end
|
||||||
|
|
||||||
def each_firstbyte
|
def each_firstbyte(valid_encoding=nil)
|
||||||
h = {}
|
h = {}
|
||||||
@map.each {|ss, action|
|
@map.each {|ss, action|
|
||||||
if ss.emptyable?
|
if ss.emptyable?
|
||||||
@ -184,17 +187,27 @@ class ActionMap
|
|||||||
h[byte] ||= {}
|
h[byte] ||= {}
|
||||||
if h[byte][rest]
|
if h[byte][rest]
|
||||||
raise "ambiguous"
|
raise "ambiguous"
|
||||||
else
|
|
||||||
h[byte][rest] = action
|
|
||||||
end
|
end
|
||||||
|
h[byte][rest] = action
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
}
|
}
|
||||||
h.keys.sort.each {|byte|
|
if valid_encoding
|
||||||
am = ActionMap.new(h[byte])
|
valid_encoding.each_firstbyte {|byte, rest|
|
||||||
am.default_action = @default_action
|
if h[byte]
|
||||||
yield byte, am
|
am = ActionMap.new(h[byte])
|
||||||
}
|
yield byte, am, rest
|
||||||
|
else
|
||||||
|
am = ActionMap.new(rest => :undef)
|
||||||
|
yield byte, am, nil
|
||||||
|
end
|
||||||
|
}
|
||||||
|
else
|
||||||
|
h.keys.sort.each {|byte|
|
||||||
|
am = ActionMap.new(h[byte])
|
||||||
|
yield byte, am, nil
|
||||||
|
}
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
OffsetsMemo = {}
|
OffsetsMemo = {}
|
||||||
@ -257,24 +270,14 @@ class ActionMap
|
|||||||
offsets = []
|
offsets = []
|
||||||
infos = []
|
infos = []
|
||||||
infomap = {}
|
infomap = {}
|
||||||
noaction_bytes = []
|
|
||||||
table.each_with_index {|action, byte|
|
table.each_with_index {|action, byte|
|
||||||
if !action
|
action ||= :invalid
|
||||||
noaction_bytes << byte
|
|
||||||
next
|
|
||||||
end
|
|
||||||
unless o = infomap[action]
|
unless o = infomap[action]
|
||||||
infomap[action] = o = infos.length
|
infomap[action] = o = infos.length
|
||||||
infos[o] = action
|
infos[o] = action
|
||||||
end
|
end
|
||||||
offsets[byte] = o
|
offsets[byte] = o
|
||||||
}
|
}
|
||||||
if !noaction_bytes.empty?
|
|
||||||
noaction_bytes.each {|byte|
|
|
||||||
offsets[byte] = infos.length
|
|
||||||
}
|
|
||||||
infos << @default_action
|
|
||||||
end
|
|
||||||
|
|
||||||
if n = OffsetsMemo[offsets]
|
if n = OffsetsMemo[offsets]
|
||||||
offsets_name = n
|
offsets_name = n
|
||||||
@ -315,15 +318,15 @@ End
|
|||||||
PostMemo = {}
|
PostMemo = {}
|
||||||
NextName = "a"
|
NextName = "a"
|
||||||
|
|
||||||
def generate_node(code, name_hint=nil, ranges=[])
|
def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil)
|
||||||
ranges = [0x00..0xff] if ranges.empty?
|
ranges = [0x00..0xff] if ranges.empty?
|
||||||
range = ranges.first
|
range = ranges.first
|
||||||
if n = PreMemo[self]
|
if n = PreMemo[[self,valid_encoding]]
|
||||||
return n
|
return n
|
||||||
end
|
end
|
||||||
|
|
||||||
table = Array.new(range.end - range.begin + 1)
|
table = Array.new(range.end - range.begin + 1)
|
||||||
each_firstbyte {|byte, rest|
|
each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
|
||||||
unless range === byte
|
unless range === byte
|
||||||
raise "byte not in range"
|
raise "byte not in range"
|
||||||
end
|
end
|
||||||
@ -332,7 +335,7 @@ End
|
|||||||
else
|
else
|
||||||
name_hint2 = nil
|
name_hint2 = nil
|
||||||
name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
|
name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
|
||||||
table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1])
|
table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding)
|
||||||
end
|
end
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -345,7 +348,7 @@ End
|
|||||||
NextName.succ!
|
NextName.succ!
|
||||||
end
|
end
|
||||||
|
|
||||||
PreMemo[self] = PostMemo[table] = name_hint
|
PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint
|
||||||
|
|
||||||
code << generate_lookup_node(name_hint, table)
|
code << generate_lookup_node(name_hint, table)
|
||||||
name_hint
|
name_hint
|
||||||
@ -371,9 +374,15 @@ def transcode_compile_tree(name, from, map)
|
|||||||
}
|
}
|
||||||
am = ActionMap.parse(h)
|
am = ActionMap.parse(h)
|
||||||
|
|
||||||
|
if ValidEncoding[from]
|
||||||
|
valid_encoding = StrSet.parse(ValidEncoding[from])
|
||||||
|
else
|
||||||
|
valid_encoding = nil
|
||||||
|
end
|
||||||
|
|
||||||
ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
|
ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
|
||||||
code = ''
|
code = ''
|
||||||
defined_name = am.generate_node(code, name, ranges)
|
defined_name = am.generate_node(code, name, ranges, valid_encoding)
|
||||||
return defined_name, code
|
return defined_name, code
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -419,75 +428,72 @@ def transcode_register_code
|
|||||||
code
|
code
|
||||||
end
|
end
|
||||||
|
|
||||||
Universe = {
|
ValidEncoding = {
|
||||||
"singlebyte" => "{00-ff}",
|
'1byte' => '{00-ff}',
|
||||||
"doublebyte" => "{00-ff}{00-ff}",
|
'2byte' => '{00-ff}{00-ff}',
|
||||||
"quadruplebyte" => "{00-ff}{00-ff}{00-ff}{00-ff}",
|
'4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
|
||||||
"US-ASCII" => "{00-7f}",
|
'US-ASCII' => '{00-7f}',
|
||||||
"EUC-JP" => <<-End,
|
'UTF-8' => '{00-7f}
|
||||||
{00-7f}
|
{c2-df}{80-bf}
|
||||||
{a1-fe}{a1-fe}
|
e0{a0-bf}{80-bf}
|
||||||
8e{a1-fe}
|
{e1-ec}{80-bf}{80-bf}
|
||||||
8f{a1-fe}{a1-fe}
|
ed{80-9f}{80-bf}
|
||||||
End
|
{ee-ef}{80-bf}{80-bf}
|
||||||
"EUC-KR" => <<-End,
|
f0{90-bf}{80-bf}{80-bf}
|
||||||
{00-7f}
|
{f1-f3}{80-bf}{80-bf}{80-bf}
|
||||||
{a1-fe}{a1-fe}
|
f4{80-8f}{80-bf}{80-bf}',
|
||||||
End
|
'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
|
||||||
"EUC-TW" => <<-End,
|
{d8-db}{00-ff}{dc-df}{00-ff}',
|
||||||
{00-7f}
|
'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
|
||||||
{a1-fe}{a1-fe}
|
{00-ff}{d8-db}{00-ff}{dc-df}',
|
||||||
8e{a1-b0}{a1-fe}{a1-fe}
|
'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
|
||||||
End
|
00{01-10}{00-ff}{00-ff}',
|
||||||
"Shift_JIS" => <<-End,
|
'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
|
||||||
{00-7f}
|
{00-ff}{00-ff}{01-10}00',
|
||||||
{81-9f,e0-fc}{40-7e,80-fc}
|
'EUC-JP' => '{00-7f}
|
||||||
{a1-df}
|
{a1-fe}{a1-fe}
|
||||||
End
|
8e{a1-fe}
|
||||||
"Big5" => <<-End,
|
8f{a1-fe}{a1-fe}',
|
||||||
{00-7f}
|
'CP51932' => '{00-7f}
|
||||||
{a1-fe}{40-7e,a1-fe}
|
{a1-fe}{a1-fe}
|
||||||
End
|
8e{a1-fe}',
|
||||||
"GBK" => <<-End,
|
'Shift_JIS' => '{00-7f}
|
||||||
{00-80}
|
{81-9f,e0-fc}{40-7e,80-fc}
|
||||||
{81-fe}{40-7e,80-fe}
|
{a1-df}',
|
||||||
End
|
'EUC-KR' => '{00-7f}
|
||||||
"CP949" => <<-End,
|
{a1-fe}{a1-fe}',
|
||||||
{00-80}
|
'CP949' => '{00-7f}
|
||||||
{81-fe}{41-5a,61-7a,81-fe}
|
{81-fe}{41-5a,61-7a,81-fe}',
|
||||||
End
|
'Big5' => '{00-7f}
|
||||||
"UTF-8" => <<-End,
|
{81-fe}{40-7e,a1-fe}',
|
||||||
{00-7f}
|
'EUC-TW' => '{00-7f}
|
||||||
{c2-df}{80-bf}
|
{a1-fe}{a1-fe}
|
||||||
e0{a0-bf}{80-bf}
|
8e{a1-b0}{a1-fe}{a1-fe}',
|
||||||
{e1-ec}{80-bf}{80-bf}
|
'GBK' => '{00-80}
|
||||||
ed{80-9f}{80-bf}
|
{81-fe}{40-7e,80-fe}',
|
||||||
{ee-ef}{80-bf}{80-bf}
|
'GB18030' => '{00-7f}
|
||||||
f0{90-bf}{80-bf}{80-bf}
|
{81-fe}{40-7e,80-fe}
|
||||||
{f1-f3}{80-bf}{80-bf}{80-bf}
|
{81-fe}{30-39}{81-fe}{30-39}',
|
||||||
f4{80-8f}{80-bf}{80-bf}
|
}
|
||||||
End
|
|
||||||
"GB18030" => <<-End,
|
{
|
||||||
{00-7f}
|
'ISO-8859-1' => '1byte',
|
||||||
{81-fe}{40-7e,80-fe}
|
'ISO-8859-2' => '1byte',
|
||||||
{81-fe}{30-93}{81-fe}{30-93}
|
'ISO-8859-3' => '1byte',
|
||||||
End
|
'ISO-8859-4' => '1byte',
|
||||||
"UTF-16BE" => <<-End,
|
'ISO-8859-5' => '1byte',
|
||||||
{00-d7,e0-ff}{00-ff}
|
'ISO-8859-6' => '1byte',
|
||||||
{d8-db}{00-ff}{dc-df}{00-ff}
|
'ISO-8859-7' => '1byte',
|
||||||
End
|
'ISO-8859-8' => '1byte',
|
||||||
"UTF-16LE" => <<-End,
|
'ISO-8859-9' => '1byte',
|
||||||
{00-ff}{00-d7,e0-ff}
|
'ISO-8859-10' => '1byte',
|
||||||
{00-ff}{d8-db}{00-ff}{dc-df}
|
'ISO-8859-11' => '1byte',
|
||||||
End
|
'ISO-8859-13' => '1byte',
|
||||||
"UTF-32BE" => <<-End,
|
'ISO-8859-14' => '1byte',
|
||||||
0000{00-d7,e0-ff}{00-ff}
|
'ISO-8859-15' => '1byte',
|
||||||
00{01-10}{00-ff}{00-ff}
|
'Windows-31J' => 'Shift_JIS',
|
||||||
End
|
}.each {|k, v|
|
||||||
"UTF-32LE" => <<-End,
|
ValidEncoding[k] = ValidEncoding.fetch(v)
|
||||||
{00-ff}{00-d7,e0-ff}0000
|
|
||||||
{00-ff}{00-ff}{01-10}00
|
|
||||||
End
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def make_signature(filename, src)
|
def make_signature(filename, src)
|
||||||
@ -528,7 +534,7 @@ if !force_mode && output_filename && File.readable?(output_filename)
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
}
|
}
|
||||||
if old_signature == chk_signature
|
if old_signature == chk_signature && File.mtime(__FILE__) < File.mtime(output_filename)
|
||||||
now = Time.now
|
now = Time.now
|
||||||
File.utime(now, now, output_filename)
|
File.utime(now, now, output_filename)
|
||||||
STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
|
STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
|
||||||
|
@ -188,7 +188,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|||||||
unsigned char next_byte;
|
unsigned char next_byte;
|
||||||
int from_utf8 = my_transcoder->from_utf8;
|
int from_utf8 = my_transcoder->from_utf8;
|
||||||
unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
|
unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
|
||||||
rb_encoding *from_encoding = rb_enc_find(my_transcoder->from_encoding);
|
|
||||||
rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
|
rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
|
||||||
|
|
||||||
while (in_p < in_stop) {
|
while (in_p < in_stop) {
|
||||||
@ -280,10 +279,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|||||||
/* valid character in from encoding
|
/* valid character in from encoding
|
||||||
* but no related character(s) in to encoding */
|
* but no related character(s) in to encoding */
|
||||||
/* todo: add more alternative behaviors */
|
/* todo: add more alternative behaviors */
|
||||||
{
|
|
||||||
int len = rb_enc_mbclen((const char *)char_start, (const char *)in_stop, from_encoding);
|
|
||||||
while (in_p < char_start + len) in_p++;
|
|
||||||
}
|
|
||||||
if (opt&UNDEF_IGNORE) {
|
if (opt&UNDEF_IGNORE) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user