* tool/transcode-tblgen.rb: distinguish UNDEF and INVALID.

[ruby-dev:35709]

* transcode.c (transcode_loop): don't need rb_enc_mbclen now.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2008-08-06 11:47:14 +00:00
parent fc841ddc66
commit 5adb247914
3 changed files with 127 additions and 119 deletions

View File

@ -1,3 +1,10 @@
Wed Aug 6 20:44:41 2008 Tanaka Akira <akr@fsij.org>
* tool/transcode-tblgen.rb: distinguish UNDEF and INVALID.
[ruby-dev:35709]
* transcode.c (transcode_loop): don't need rb_enc_mbclen now.
Wed Aug 6 14:40:11 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
* common.mk (transdb.h): requires transcoders.

View File

@ -17,6 +17,9 @@ end
class StrSet class StrSet
def self.parse(pattern) def self.parse(pattern)
if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
raise ArgumentError, "invalid pattern: #{pattern.inspect}"
end
result = [] result = []
pattern.scan(/\S+/) {|seq| pattern.scan(/\S+/) {|seq|
seq_result = [] seq_result = []
@ -69,25 +72,27 @@ class StrSet
def to_s def to_s
if @pat.empty? if @pat.empty?
"(empset)" "(empset)"
elsif @pat == [[]]
"(empstr)"
else else
@pat.map {|seq| @pat.map {|seq|
seq.map {|byteset| if seq.empty?
if byteset.length == 1 && byteset[0].begin == byteset[0].end "(empstr)"
"%02x" % byteset[0].begin else
else seq.map {|byteset|
"{" + if byteset.length == 1 && byteset[0].begin == byteset[0].end
byteset.map {|range| "%02x" % byteset[0].begin
if range.begin == range.end else
"%02x" % range.begin "{" +
else byteset.map {|range|
"%02x-%02x" % [range.begin, range.end] if range.begin == range.end
end "%02x" % range.begin
}.join(',') + else
"}" "%02x-%02x" % [range.begin, range.end]
end end
}.join('') }.join(',') +
"}"
end
}.join('')
end
}.join(' ') }.join(' ')
end end
end end
@ -142,9 +147,7 @@ class ActionMap
def initialize(h) def initialize(h)
@map = h @map = h
@default_action = :undef
end end
attr_accessor :default_action
def hash def hash
hash = 0 hash = 0
@ -174,7 +177,7 @@ class ActionMap
nil nil
end end
def each_firstbyte def each_firstbyte(valid_encoding=nil)
h = {} h = {}
@map.each {|ss, action| @map.each {|ss, action|
if ss.emptyable? if ss.emptyable?
@ -184,17 +187,27 @@ class ActionMap
h[byte] ||= {} h[byte] ||= {}
if h[byte][rest] if h[byte][rest]
raise "ambiguous" raise "ambiguous"
else
h[byte][rest] = action
end end
h[byte][rest] = action
} }
end end
} }
h.keys.sort.each {|byte| if valid_encoding
am = ActionMap.new(h[byte]) valid_encoding.each_firstbyte {|byte, rest|
am.default_action = @default_action if h[byte]
yield byte, am am = ActionMap.new(h[byte])
} yield byte, am, rest
else
am = ActionMap.new(rest => :undef)
yield byte, am, nil
end
}
else
h.keys.sort.each {|byte|
am = ActionMap.new(h[byte])
yield byte, am, nil
}
end
end end
OffsetsMemo = {} OffsetsMemo = {}
@ -257,24 +270,14 @@ class ActionMap
offsets = [] offsets = []
infos = [] infos = []
infomap = {} infomap = {}
noaction_bytes = []
table.each_with_index {|action, byte| table.each_with_index {|action, byte|
if !action action ||= :invalid
noaction_bytes << byte
next
end
unless o = infomap[action] unless o = infomap[action]
infomap[action] = o = infos.length infomap[action] = o = infos.length
infos[o] = action infos[o] = action
end end
offsets[byte] = o offsets[byte] = o
} }
if !noaction_bytes.empty?
noaction_bytes.each {|byte|
offsets[byte] = infos.length
}
infos << @default_action
end
if n = OffsetsMemo[offsets] if n = OffsetsMemo[offsets]
offsets_name = n offsets_name = n
@ -315,15 +318,15 @@ End
PostMemo = {} PostMemo = {}
NextName = "a" NextName = "a"
def generate_node(code, name_hint=nil, ranges=[]) def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil)
ranges = [0x00..0xff] if ranges.empty? ranges = [0x00..0xff] if ranges.empty?
range = ranges.first range = ranges.first
if n = PreMemo[self] if n = PreMemo[[self,valid_encoding]]
return n return n
end end
table = Array.new(range.end - range.begin + 1) table = Array.new(range.end - range.begin + 1)
each_firstbyte {|byte, rest| each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
unless range === byte unless range === byte
raise "byte not in range" raise "byte not in range"
end end
@ -332,7 +335,7 @@ End
else else
name_hint2 = nil name_hint2 = nil
name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1]) table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding)
end end
} }
@ -345,7 +348,7 @@ End
NextName.succ! NextName.succ!
end end
PreMemo[self] = PostMemo[table] = name_hint PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint
code << generate_lookup_node(name_hint, table) code << generate_lookup_node(name_hint, table)
name_hint name_hint
@ -371,9 +374,15 @@ def transcode_compile_tree(name, from, map)
} }
am = ActionMap.parse(h) am = ActionMap.parse(h)
if ValidEncoding[from]
valid_encoding = StrSet.parse(ValidEncoding[from])
else
valid_encoding = nil
end
ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : [] ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
code = '' code = ''
defined_name = am.generate_node(code, name, ranges) defined_name = am.generate_node(code, name, ranges, valid_encoding)
return defined_name, code return defined_name, code
end end
@ -419,75 +428,72 @@ def transcode_register_code
code code
end end
Universe = { ValidEncoding = {
"singlebyte" => "{00-ff}", '1byte' => '{00-ff}',
"doublebyte" => "{00-ff}{00-ff}", '2byte' => '{00-ff}{00-ff}',
"quadruplebyte" => "{00-ff}{00-ff}{00-ff}{00-ff}", '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
"US-ASCII" => "{00-7f}", 'US-ASCII' => '{00-7f}',
"EUC-JP" => <<-End, 'UTF-8' => '{00-7f}
{00-7f} {c2-df}{80-bf}
{a1-fe}{a1-fe} e0{a0-bf}{80-bf}
8e{a1-fe} {e1-ec}{80-bf}{80-bf}
8f{a1-fe}{a1-fe} ed{80-9f}{80-bf}
End {ee-ef}{80-bf}{80-bf}
"EUC-KR" => <<-End, f0{90-bf}{80-bf}{80-bf}
{00-7f} {f1-f3}{80-bf}{80-bf}{80-bf}
{a1-fe}{a1-fe} f4{80-8f}{80-bf}{80-bf}',
End 'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
"EUC-TW" => <<-End, {d8-db}{00-ff}{dc-df}{00-ff}',
{00-7f} 'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
{a1-fe}{a1-fe} {00-ff}{d8-db}{00-ff}{dc-df}',
8e{a1-b0}{a1-fe}{a1-fe} 'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
End 00{01-10}{00-ff}{00-ff}',
"Shift_JIS" => <<-End, 'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
{00-7f} {00-ff}{00-ff}{01-10}00',
{81-9f,e0-fc}{40-7e,80-fc} 'EUC-JP' => '{00-7f}
{a1-df} {a1-fe}{a1-fe}
End 8e{a1-fe}
"Big5" => <<-End, 8f{a1-fe}{a1-fe}',
{00-7f} 'CP51932' => '{00-7f}
{a1-fe}{40-7e,a1-fe} {a1-fe}{a1-fe}
End 8e{a1-fe}',
"GBK" => <<-End, 'Shift_JIS' => '{00-7f}
{00-80} {81-9f,e0-fc}{40-7e,80-fc}
{81-fe}{40-7e,80-fe} {a1-df}',
End 'EUC-KR' => '{00-7f}
"CP949" => <<-End, {a1-fe}{a1-fe}',
{00-80} 'CP949' => '{00-7f}
{81-fe}{41-5a,61-7a,81-fe} {81-fe}{41-5a,61-7a,81-fe}',
End 'Big5' => '{00-7f}
"UTF-8" => <<-End, {81-fe}{40-7e,a1-fe}',
{00-7f} 'EUC-TW' => '{00-7f}
{c2-df}{80-bf} {a1-fe}{a1-fe}
e0{a0-bf}{80-bf} 8e{a1-b0}{a1-fe}{a1-fe}',
{e1-ec}{80-bf}{80-bf} 'GBK' => '{00-80}
ed{80-9f}{80-bf} {81-fe}{40-7e,80-fe}',
{ee-ef}{80-bf}{80-bf} 'GB18030' => '{00-7f}
f0{90-bf}{80-bf}{80-bf} {81-fe}{40-7e,80-fe}
{f1-f3}{80-bf}{80-bf}{80-bf} {81-fe}{30-39}{81-fe}{30-39}',
f4{80-8f}{80-bf}{80-bf} }
End
"GB18030" => <<-End, {
{00-7f} 'ISO-8859-1' => '1byte',
{81-fe}{40-7e,80-fe} 'ISO-8859-2' => '1byte',
{81-fe}{30-93}{81-fe}{30-93} 'ISO-8859-3' => '1byte',
End 'ISO-8859-4' => '1byte',
"UTF-16BE" => <<-End, 'ISO-8859-5' => '1byte',
{00-d7,e0-ff}{00-ff} 'ISO-8859-6' => '1byte',
{d8-db}{00-ff}{dc-df}{00-ff} 'ISO-8859-7' => '1byte',
End 'ISO-8859-8' => '1byte',
"UTF-16LE" => <<-End, 'ISO-8859-9' => '1byte',
{00-ff}{00-d7,e0-ff} 'ISO-8859-10' => '1byte',
{00-ff}{d8-db}{00-ff}{dc-df} 'ISO-8859-11' => '1byte',
End 'ISO-8859-13' => '1byte',
"UTF-32BE" => <<-End, 'ISO-8859-14' => '1byte',
0000{00-d7,e0-ff}{00-ff} 'ISO-8859-15' => '1byte',
00{01-10}{00-ff}{00-ff} 'Windows-31J' => 'Shift_JIS',
End }.each {|k, v|
"UTF-32LE" => <<-End, ValidEncoding[k] = ValidEncoding.fetch(v)
{00-ff}{00-d7,e0-ff}0000
{00-ff}{00-ff}{01-10}00
End
} }
def make_signature(filename, src) def make_signature(filename, src)
@ -528,7 +534,7 @@ if !force_mode && output_filename && File.readable?(output_filename)
end end
end end
} }
if old_signature == chk_signature if old_signature == chk_signature && File.mtime(__FILE__) < File.mtime(output_filename)
now = Time.now now = Time.now
File.utime(now, now, output_filename) File.utime(now, now, output_filename)
STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE

View File

@ -188,7 +188,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
unsigned char next_byte; unsigned char next_byte;
int from_utf8 = my_transcoder->from_utf8; int from_utf8 = my_transcoder->from_utf8;
unsigned char *out_s = out_stop - my_transcoder->max_output + 1; unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
rb_encoding *from_encoding = rb_enc_find(my_transcoder->from_encoding);
rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding); rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
while (in_p < in_stop) { while (in_p < in_stop) {
@ -280,10 +279,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
/* valid character in from encoding /* valid character in from encoding
* but no related character(s) in to encoding */ * but no related character(s) in to encoding */
/* todo: add more alternative behaviors */ /* todo: add more alternative behaviors */
{
int len = rb_enc_mbclen((const char *)char_start, (const char *)in_stop, from_encoding);
while (in_p < char_start + len) in_p++;
}
if (opt&UNDEF_IGNORE) { if (opt&UNDEF_IGNORE) {
continue; continue;
} }