* lib/cgi.rb (CGI::unescapeHTML): more encoding-aware unescaping.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18798 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2008-08-23 21:40:59 +00:00 · 2008-08-23 21:40:59 +00:00 · 29449d70be
commit 29449d70be
parent 33dd0c35f1
2 changed files with 34 additions and 13 deletions
--- a/4
+++ b/4
@@ -1,3 +1,7 @@
 Sun Aug 24 06:39:05 2008  NARUSE, Yui  <naruse@ruby-lang.org>
 	* lib/cgi.rb (CGI::unescapeHTML): more encoding-aware unescaping.
 Sun Aug 24 04:23:19 2008  NARUSE, Yui  <naruse@ruby-lang.org>
 	* encoding.c (enc_compatible_p): raise TypeError when argument is Encoding.
--- a/lib/cgi.rb
+++ b/lib/cgi.rb
@@ -375,6 +375,19 @@ class CGI
  #      # => "Usage: foo \"bar\" <baz>"
  def CGI::unescapeHTML(string)
    enc = string.encoding
    if [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE].include?(enc)
      return string.gsub(Regexp.new('&(amp|quot|gt|lt|#[0-9]+|#x[0-9A-Fa-f]+);'.encode(enc))) do
 	case $1.encode("US-ASCII")
 	when 'amp'                 then '&'.encode(enc)
 	when 'quot'                then '"'.encode(enc)
 	when 'gt'                  then '>'.encode(enc)
 	when 'lt'                  then '<'.encode(enc)
 	when /\A#0*(\d+)\z/        then $1.to_i.chr(enc)
 	when /\A#x([0-9a-f]+)\z/i  then $1.hex.chr(enc)
 	end
      end
    end
    asciicompat = Encoding.compatible?(string, "a")
    string.gsub(/&(amp|quot|gt|lt|\#[0-9]+|\#x[0-9A-Fa-f]+);/) do
      match = $1.dup
      case match
@@ -382,15 +395,19 @@ class CGI
      when 'quot'                then '"'
      when 'gt'                  then '>'
      when 'lt'                  then '<'
-      when /\A#0*(\d+)\z/        then
+      when /\A#0*(\d+)\z/
-        if Integer($1) < 256
+	if enc == Encoding::UTF_8
-          Integer($1).chr.force_encoding(enc)
+	  $1.to_i.chr(enc)
 	elsif $1.to_i < 128 && asciicompat
 	  $1.to_i.chr
 	else
 	  "&##{$1};"
 	end
-      when /\A#x([0-9a-f]+)\z/i then
+      when /\A#x([0-9a-f]+)\z/i
-        if $1.hex < 256
+	if enc == Encoding::UTF_8
-          $1.hex.chr.force_encoding(enc)
+	  $1.hex.chr(enc)
 	elsif $1.hex < 128 && asciicompat
 	  $1.hex.chr
 	else
 	  "&#x#{$1};"
 	end