* lib/rexml/source.rb: Move encoding detection code to base class.
* lib/rexml/encoding.rb: Remove needless encoding detection code.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37365 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
7ba54654a5
commit
718813ca9b
@ -1,3 +1,8 @@
|
|||||||
|
Sun Oct 28 23:47:09 2012 Kouhei Sutou <kou@cozmixng.org>
|
||||||
|
|
||||||
|
* lib/rexml/source.rb: Move encoding detection code to base class.
|
||||||
|
* lib/rexml/encoding.rb: Remove needless encoding detection code.
|
||||||
|
|
||||||
Sun Oct 28 21:40:13 2012 Kouhei Sutou <kou@cozmixng.org>
|
Sun Oct 28 21:40:13 2012 Kouhei Sutou <kou@cozmixng.org>
|
||||||
|
|
||||||
* lib/rexml/parsers/baseparser.rb: Fix a bug that UTF-8 is used
|
* lib/rexml/parsers/baseparser.rb: Fix a bug that UTF-8 is used
|
||||||
|
@ -20,19 +20,6 @@ module REXML
|
|||||||
true
|
true
|
||||||
end
|
end
|
||||||
|
|
||||||
def check_encoding(xml)
|
|
||||||
# We have to recognize UTF-16BE, UTF-16LE, and UTF-8
|
|
||||||
if xml[0, 2] == "\xfe\xff"
|
|
||||||
xml[0, 2] = ""
|
|
||||||
return 'UTF-16BE'
|
|
||||||
elsif xml[0, 2] == "\xff\xfe"
|
|
||||||
xml[0, 2] = ""
|
|
||||||
return 'UTF-16LE'
|
|
||||||
end
|
|
||||||
xml =~ /^\s*<\?xml\s+version\s*=\s*(['"]).*?\1\s+encoding\s*=\s*(["'])(.*?)\2/m
|
|
||||||
return $3 ? $3.upcase : 'UTF-8'
|
|
||||||
end
|
|
||||||
|
|
||||||
def encode(string)
|
def encode(string)
|
||||||
string.encode(@encoding)
|
string.encode(@encoding)
|
||||||
end
|
end
|
||||||
|
@ -43,7 +43,7 @@ module REXML
|
|||||||
if encoding
|
if encoding
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
else
|
else
|
||||||
self.encoding = check_encoding( @buffer )
|
detect_encoding
|
||||||
end
|
end
|
||||||
@line = 0
|
@line = 0
|
||||||
end
|
end
|
||||||
@ -53,14 +53,7 @@ module REXML
|
|||||||
# Overridden to support optimized en/decoding
|
# Overridden to support optimized en/decoding
|
||||||
def encoding=(enc)
|
def encoding=(enc)
|
||||||
return unless super
|
return unless super
|
||||||
@line_break = encode( '>' )
|
encoding_updated
|
||||||
if @encoding != 'UTF-8'
|
|
||||||
@buffer = decode(@buffer)
|
|
||||||
@to_utf = true
|
|
||||||
else
|
|
||||||
@to_utf = false
|
|
||||||
@buffer.force_encoding ::Encoding::UTF_8
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Scans the source for a given pattern. Note, that this is not your
|
# Scans the source for a given pattern. Note, that this is not your
|
||||||
@ -125,6 +118,38 @@ module REXML
|
|||||||
res = res[-1] if res.kind_of? Array
|
res = res[-1] if res.kind_of? Array
|
||||||
lines.index( res ) if res
|
lines.index( res ) if res
|
||||||
end
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
def detect_encoding
|
||||||
|
buffer_encoding = @buffer.encoding
|
||||||
|
detected_encoding = "UTF-8"
|
||||||
|
begin
|
||||||
|
@buffer.force_encoding("ASCII-8BIT")
|
||||||
|
if @buffer[0, 2] == "\xfe\xff"
|
||||||
|
@buffer[0, 2] = ""
|
||||||
|
detected_encoding = "UTF-16BE"
|
||||||
|
elsif @buffer[0, 2] == "\xff\xfe"
|
||||||
|
@buffer[0, 2] = ""
|
||||||
|
detected_encoding = "UTF-16LE"
|
||||||
|
elsif @buffer[0, 3] == "\xef\xbb\xbf"
|
||||||
|
@buffer[0, 3] = ""
|
||||||
|
detected_encoding = "UTF-8"
|
||||||
|
end
|
||||||
|
ensure
|
||||||
|
@buffer.force_encoding(buffer_encoding)
|
||||||
|
end
|
||||||
|
self.encoding = detected_encoding
|
||||||
|
end
|
||||||
|
|
||||||
|
def encoding_updated
|
||||||
|
if @encoding != 'UTF-8'
|
||||||
|
@buffer = decode(@buffer)
|
||||||
|
@to_utf = true
|
||||||
|
else
|
||||||
|
@to_utf = false
|
||||||
|
@buffer.force_encoding ::Encoding::UTF_8
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# A Source that wraps an IO. See the Source class for method
|
# A Source that wraps an IO. See the Source class for method
|
||||||
@ -136,46 +161,12 @@ module REXML
|
|||||||
def initialize(arg, block_size=500, encoding=nil)
|
def initialize(arg, block_size=500, encoding=nil)
|
||||||
@er_source = @source = arg
|
@er_source = @source = arg
|
||||||
@to_utf = false
|
@to_utf = false
|
||||||
|
@pending_buffer = nil
|
||||||
|
|
||||||
# Determining the encoding is a deceptively difficult issue to resolve.
|
|
||||||
# First, we check the first two bytes for UTF-16. Then we
|
|
||||||
# assume that the encoding is at least ASCII enough for the '>', and
|
|
||||||
# we read until we get one of those. This gives us the XML declaration,
|
|
||||||
# if there is one. If there isn't one, the file MUST be UTF-8, as per
|
|
||||||
# the XML spec. If there is one, we can determine the encoding from
|
|
||||||
# it.
|
|
||||||
if encoding
|
if encoding
|
||||||
super("", encoding)
|
super("", encoding)
|
||||||
else
|
else
|
||||||
need_super_with_line = false
|
super(@source.read(3) || "")
|
||||||
str = @source.read( 2 ) || ''
|
|
||||||
str.force_encoding("ASCII-8BIT")
|
|
||||||
if str[0, 2] == "\xfe\xff"
|
|
||||||
@source.binmode
|
|
||||||
@source.set_encoding("UTF-16BE")
|
|
||||||
super("", "UTF-16BE")
|
|
||||||
elsif str[0, 2] == "\xff\xfe"
|
|
||||||
@source.binmode
|
|
||||||
@source.set_encoding("UTF-16LE")
|
|
||||||
super("", "UTF-16LE")
|
|
||||||
elsif str[0, 2] == "\xef\xbb"
|
|
||||||
str += @source.read(1)
|
|
||||||
if str[2, 1] == "\xBF"
|
|
||||||
@source.set_encoding("UTF-8")
|
|
||||||
super("", "UTF-8")
|
|
||||||
else
|
|
||||||
need_super_with_line = true
|
|
||||||
end
|
|
||||||
else
|
|
||||||
need_super_with_line = true
|
|
||||||
end
|
|
||||||
if need_super_with_line
|
|
||||||
if @source.eof?
|
|
||||||
super(str)
|
|
||||||
else
|
|
||||||
super(str + @source.readline(">"))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
if !@to_utf and
|
if !@to_utf and
|
||||||
@ -271,6 +262,14 @@ module REXML
|
|||||||
private
|
private
|
||||||
def readline
|
def readline
|
||||||
str = @source.readline(@line_break)
|
str = @source.readline(@line_break)
|
||||||
|
if @pending_buffer
|
||||||
|
if str.nil?
|
||||||
|
str = @pending_buffer
|
||||||
|
else
|
||||||
|
str = @pending_buffer + str
|
||||||
|
end
|
||||||
|
@pending_buffer = nil
|
||||||
|
end
|
||||||
return nil if str.nil?
|
return nil if str.nil?
|
||||||
|
|
||||||
if @to_utf
|
if @to_utf
|
||||||
@ -280,5 +279,17 @@ module REXML
|
|||||||
str
|
str
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def encoding_updated
|
||||||
|
case @encoding
|
||||||
|
when "UTF-16BE", "UTF-16LE"
|
||||||
|
@source.binmode
|
||||||
|
@source.set_encoding(@encoding)
|
||||||
|
end
|
||||||
|
@line_break = encode(">")
|
||||||
|
@pending_buffer, @buffer = @buffer, ""
|
||||||
|
@pending_buffer.force_encoding(@encoding)
|
||||||
|
super
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
Loading…
x
Reference in New Issue
Block a user