Support require 'cgi/escape' by extracting CGI::Escape from CGI::Util

Merged: https://github.com/ruby/ruby/pull/13275
2025-05-08 19:08:11 +09:00 · 2025-05-08 19:08:11 +09:00 · 8a1d45144b · 2025-05-09 05:27:44 +00:00
commit 8a1d45144b
parent c667683768
6 changed files with 251 additions and 240 deletions
--- a/ext/cgi/escape/escape.c
+++ b/ext/cgi/escape/escape.c
@@ -8,7 +8,7 @@ RUBY_EXTERN const signed char ruby_digit36_to_number_table[];
 #define upper_hexdigits (ruby_hexdigits+16)
 #define char_to_number(c) ruby_digit36_to_number_table[(unsigned char)(c)]

-static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
+static VALUE rb_cCGI, rb_mEscape, rb_mEscapeExt;
 static ID id_accept_charset;

 #define HTML_ESCAPE_MAX_LEN 6
@@ -471,17 +471,17 @@ Init_escape(void)
 void
 InitVM_escape(void)
 {
-    rb_cCGI    = rb_define_class("CGI", rb_cObject);
-    rb_mEscape = rb_define_module_under(rb_cCGI, "Escape");
-    rb_mUtil   = rb_define_module_under(rb_cCGI, "Util");
-    rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
-    rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
-    rb_define_method(rb_mEscape, "escapeURIComponent", cgiesc_escape_uri_component, 1);
-    rb_define_alias(rb_mEscape, "escape_uri_component", "escapeURIComponent");
-    rb_define_method(rb_mEscape, "unescapeURIComponent", cgiesc_unescape_uri_component, -1);
-    rb_define_alias(rb_mEscape, "unescape_uri_component", "unescapeURIComponent");
-    rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
-    rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
-    rb_prepend_module(rb_mUtil, rb_mEscape);
-    rb_extend_object(rb_cCGI, rb_mEscape);
+    rb_cCGI       = rb_define_class("CGI", rb_cObject);
+    rb_mEscapeExt = rb_define_module_under(rb_cCGI, "EscapeExt");
+    rb_mEscape    = rb_define_module_under(rb_cCGI, "Escape");
+    rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, 1);
+    rb_define_method(rb_mEscapeExt, "unescapeHTML", cgiesc_unescape_html, 1);
+    rb_define_method(rb_mEscapeExt, "escapeURIComponent", cgiesc_escape_uri_component, 1);
+    rb_define_alias(rb_mEscapeExt, "escape_uri_component", "escapeURIComponent");
+    rb_define_method(rb_mEscapeExt, "unescapeURIComponent", cgiesc_unescape_uri_component, -1);
+    rb_define_alias(rb_mEscapeExt, "unescape_uri_component", "unescapeURIComponent");
+    rb_define_method(rb_mEscapeExt, "escape", cgiesc_escape, 1);
+    rb_define_method(rb_mEscapeExt, "unescape", cgiesc_unescape, -1);
+    rb_prepend_module(rb_mEscape, rb_mEscapeExt);
+    rb_extend_object(rb_cCGI, rb_mEscapeExt);
 }
--- a/lib/cgi.rb
+++ b/lib/cgi.rb
@@ -294,4 +294,5 @@ end
 require 'cgi/core'
 require 'cgi/cookie'
 require 'cgi/util'
+require 'cgi/escape'
 CGI.autoload(:HtmlExtension, 'cgi/html')
--- a/lib/cgi/core.rb
+++ b/lib/cgi/core.rb
@@ -4,12 +4,12 @@
 # generating HTTP responses.
 #++
 class CGI
-  unless const_defined?(:Util)
-    module Util
+  unless const_defined?(:Escape)
+    module Escape
      @@accept_charset = "UTF-8" # :nodoc:
    end
-    include Util
-    extend Util
+    include Escape
+    extend Escape
  end

  $CGI_ENV = ENV    # for FCGI support
--- a/lib/cgi/escape.rb
+++ b/lib/cgi/escape.rb
@@ -0,0 +1,224 @@
+# frozen_string_literal: true
+
+class CGI
+  module Escape; end
+  include Escape
+  extend Escape
+end
+
+module CGI::Escape
+  @@accept_charset = Encoding::UTF_8 unless defined?(@@accept_charset)
+
+  # URL-encode a string into application/x-www-form-urlencoded.
+  # Space characters (+" "+) are encoded with plus signs (+"+"+)
+  #   url_encoded_string = CGI.escape("'Stop!' said Fred")
+  #      # => "%27Stop%21%27+said+Fred"
+  def escape(string)
+    encoding = string.encoding
+    buffer = string.b
+    buffer.gsub!(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
+      '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+    end
+    buffer.tr!(' ', '+')
+    buffer.force_encoding(encoding)
+  end
+
+  # URL-decode an application/x-www-form-urlencoded string, with an optional encoding.
+  #   string = CGI.unescape("%27Stop%21%27+said+Fred")
+  #      # => "'Stop!' said Fred"
+  def unescape(string, encoding = @@accept_charset)
+    str = string.tr('+', ' ')
+    str = str.b
+    str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
+      [m.delete('%')].pack('H*')
+    end
+    str.force_encoding(encoding)
+    str.valid_encoding? ? str : str.force_encoding(string.encoding)
+  end
+
+  # URL-encode a string following RFC 3986
+  # Space characters (+" "+) are encoded with (+"%20"+)
+  #   url_encoded_string = CGI.escapeURIComponent("'Stop!' said Fred")
+  #      # => "%27Stop%21%27%20said%20Fred"
+  def escapeURIComponent(string)
+    encoding = string.encoding
+    buffer = string.b
+    buffer.gsub!(/([^a-zA-Z0-9_.\-~]+)/) do |m|
+      '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+    end
+    buffer.force_encoding(encoding)
+  end
+  alias escape_uri_component escapeURIComponent
+
+  # URL-decode a string following RFC 3986, with an optional encoding.
+  #   string = CGI.unescapeURIComponent("%27Stop%21%27+said%20Fred")
+  #      # => "'Stop!'+said Fred"
+  def unescapeURIComponent(string, encoding = @@accept_charset)
+    str = string.b
+    str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
+      [m.delete('%')].pack('H*')
+    end
+    str.force_encoding(encoding)
+    str.valid_encoding? ? str : str.force_encoding(string.encoding)
+  end
+
+  alias unescape_uri_component unescapeURIComponent
+
+  # The set of special characters and their escaped values
+  TABLE_FOR_ESCAPE_HTML__ = {
+    "'" => '&#39;',
+    '&' => '&amp;',
+    '"' => '&quot;',
+    '<' => '&lt;',
+    '>' => '&gt;',
+  }
+
+  # Escape special characters in HTML, namely ' & " < and >
+  #   CGI.escapeHTML('Usage: foo "bar" <baz>')
+  #      # => "Usage: foo &quot;bar&quot; &lt;baz&gt;"
+  def escapeHTML(string)
+    enc = string.encoding
+    unless enc.ascii_compatible?
+      if enc.dummy?
+        origenc = enc
+        enc = Encoding::Converter.asciicompat_encoding(enc)
+        string = enc ? string.encode(enc) : string.b
+      end
+      table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+      string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table)
+      string.encode!(origenc) if origenc
+      string
+    else
+      string = string.b
+      string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__)
+      string.force_encoding(enc)
+    end
+  end
+
+  # Unescape a string that has been HTML-escaped
+  #   CGI.unescapeHTML("Usage: foo &quot;bar&quot; &lt;baz&gt;")
+  #      # => "Usage: foo \"bar\" <baz>"
+  def unescapeHTML(string)
+    enc = string.encoding
+    unless enc.ascii_compatible?
+      if enc.dummy?
+        origenc = enc
+        enc = Encoding::Converter.asciicompat_encoding(enc)
+        string = enc ? string.encode(enc) : string.b
+      end
+      string = string.gsub(Regexp.new('&(apos|amp|quot|gt|lt|#[0-9]+|#x[0-9A-Fa-f]+);'.encode(enc))) do
+        case $1.encode(Encoding::US_ASCII)
+        when 'apos'                then "'".encode(enc)
+        when 'amp'                 then '&'.encode(enc)
+        when 'quot'                then '"'.encode(enc)
+        when 'gt'                  then '>'.encode(enc)
+        when 'lt'                  then '<'.encode(enc)
+        when /\A#0*(\d+)\z/        then $1.to_i.chr(enc)
+        when /\A#x([0-9a-f]+)\z/i  then $1.hex.chr(enc)
+        end
+      end
+      string.encode!(origenc) if origenc
+      return string
+    end
+    return string unless string.include? '&'
+    charlimit = case enc
+                when Encoding::UTF_8; 0x10ffff
+                when Encoding::ISO_8859_1; 256
+                else 128
+                end
+    string = string.b
+    string.gsub!(/&(apos|amp|quot|gt|lt|\#[0-9]+|\#[xX][0-9A-Fa-f]+);/) do
+      match = $1.dup
+      case match
+      when 'apos'                then "'"
+      when 'amp'                 then '&'
+      when 'quot'                then '"'
+      when 'gt'                  then '>'
+      when 'lt'                  then '<'
+      when /\A#0*(\d+)\z/
+        n = $1.to_i
+        if n < charlimit
+          n.chr(enc)
+        else
+          "&##{$1};"
+        end
+      when /\A#x([0-9a-f]+)\z/i
+        n = $1.hex
+        if n < charlimit
+          n.chr(enc)
+        else
+          "&#x#{$1};"
+        end
+      else
+        "&#{match};"
+      end
+    end
+    string.force_encoding enc
+  end
+
+  # Synonym for CGI.escapeHTML(str)
+  alias escape_html escapeHTML
+  alias h escapeHTML
+
+  # Synonym for CGI.unescapeHTML(str)
+  alias unescape_html unescapeHTML
+
+  # TruffleRuby runs the pure-Ruby variant faster, do not use the C extension there
+  unless RUBY_ENGINE == 'truffleruby'
+    begin
+      require 'cgi/escape.so'
+    rescue LoadError
+    end
+  end
+
+  # Escape only the tags of certain HTML elements in +string+.
+  #
+  # Takes an element or elements or array of elements.  Each element
+  # is specified by the name of the element, without angle brackets.
+  # This matches both the start and the end tag of that element.
+  # The attribute list of the open tag will also be escaped (for
+  # instance, the double-quotes surrounding attribute values).
+  #
+  #   print CGI.escapeElement('<BR><A HREF="url"></A>', "A", "IMG")
+  #     # "<BR>&lt;A HREF=&quot;url&quot;&gt;&lt;/A&gt;"
+  #
+  #   print CGI.escapeElement('<BR><A HREF="url"></A>', ["A", "IMG"])
+  #     # "<BR>&lt;A HREF=&quot;url&quot;&gt;&lt;/A&gt;"
+  def escapeElement(string, *elements)
+    elements = elements[0] if elements[0].kind_of?(Array)
+    unless elements.empty?
+      string.gsub(/<\/?(?:#{elements.join("|")})\b[^<>]*+>?/im) do
+        CGI.escapeHTML($&)
+      end
+    else
+      string
+    end
+  end
+
+  # Undo escaping such as that done by CGI.escapeElement()
+  #
+  #   print CGI.unescapeElement(
+  #           CGI.escapeHTML('<BR><A HREF="url"></A>'), "A", "IMG")
+  #     # "&lt;BR&gt;<A HREF="url"></A>"
+  #
+  #   print CGI.unescapeElement(
+  #           CGI.escapeHTML('<BR><A HREF="url"></A>'), ["A", "IMG"])
+  #     # "&lt;BR&gt;<A HREF="url"></A>"
+  def unescapeElement(string, *elements)
+    elements = elements[0] if elements[0].kind_of?(Array)
+    unless elements.empty?
+      string.gsub(/&lt;\/?(?:#{elements.join("|")})\b(?>[^&]+|&(?![gl]t;)\w+;)*(?:&gt;)?/im) do
+        unescapeHTML($&)
+      end
+    else
+      string
+    end
+  end
+
+  # Synonym for CGI.escapeElement(str)
+  alias escape_element escapeElement
+
+  # Synonym for CGI.unescapeElement(str)
+  alias unescape_element unescapeElement
+
+end
--- a/lib/cgi/util.rb
+++ b/lib/cgi/util.rb
@@ -4,220 +4,8 @@ class CGI
  include Util
  extend Util
 end
+
 module CGI::Util
-  @@accept_charset = Encoding::UTF_8 unless defined?(@@accept_charset)
-
-  # URL-encode a string into application/x-www-form-urlencoded.
-  # Space characters (+" "+) are encoded with plus signs (+"+"+)
-  #   url_encoded_string = CGI.escape("'Stop!' said Fred")
-  #      # => "%27Stop%21%27+said+Fred"
-  def escape(string)
-    encoding = string.encoding
-    buffer = string.b
-    buffer.gsub!(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
-      '%' + m.unpack('H2' * m.bytesize).join('%').upcase
-    end
-    buffer.tr!(' ', '+')
-    buffer.force_encoding(encoding)
-  end
-
-  # URL-decode an application/x-www-form-urlencoded string with encoding(optional).
-  #   string = CGI.unescape("%27Stop%21%27+said+Fred")
-  #      # => "'Stop!' said Fred"
-  def unescape(string, encoding = @@accept_charset)
-    str = string.tr('+', ' ')
-    str = str.b
-    str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
-      [m.delete('%')].pack('H*')
-    end
-    str.force_encoding(encoding)
-    str.valid_encoding? ? str : str.force_encoding(string.encoding)
-  end
-
-  # URL-encode a string following RFC 3986
-  # Space characters (+" "+) are encoded with (+"%20"+)
-  #   url_encoded_string = CGI.escapeURIComponent("'Stop!' said Fred")
-  #      # => "%27Stop%21%27%20said%20Fred"
-  def escapeURIComponent(string)
-    encoding = string.encoding
-    buffer = string.b
-    buffer.gsub!(/([^a-zA-Z0-9_.\-~]+)/) do |m|
-      '%' + m.unpack('H2' * m.bytesize).join('%').upcase
-    end
-    buffer.force_encoding(encoding)
-  end
-  alias escape_uri_component escapeURIComponent
-
-  # URL-decode a string following RFC 3986 with encoding(optional).
-  #   string = CGI.unescapeURIComponent("%27Stop%21%27+said%20Fred")
-  #      # => "'Stop!'+said Fred"
-  def unescapeURIComponent(string, encoding = @@accept_charset)
-    str = string.b
-    str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
-      [m.delete('%')].pack('H*')
-    end
-    str.force_encoding(encoding)
-    str.valid_encoding? ? str : str.force_encoding(string.encoding)
-  end
-
-  alias unescape_uri_component unescapeURIComponent
-
-  # The set of special characters and their escaped values
-  TABLE_FOR_ESCAPE_HTML__ = {
-    "'" => '&#39;',
-    '&' => '&amp;',
-    '"' => '&quot;',
-    '<' => '&lt;',
-    '>' => '&gt;',
-  }
-
-  # Escape special characters in HTML, namely '&\"<>
-  #   CGI.escapeHTML('Usage: foo "bar" <baz>')
-  #      # => "Usage: foo &quot;bar&quot; &lt;baz&gt;"
-  def escapeHTML(string)
-    enc = string.encoding
-    unless enc.ascii_compatible?
-      if enc.dummy?
-        origenc = enc
-        enc = Encoding::Converter.asciicompat_encoding(enc)
-        string = enc ? string.encode(enc) : string.b
-      end
-      table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
-      string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table)
-      string.encode!(origenc) if origenc
-      string
-    else
-      string = string.b
-      string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__)
-      string.force_encoding(enc)
-    end
-  end
-
-  # TruffleRuby runs the pure-Ruby variant faster, do not use the C extension there
-  unless RUBY_ENGINE == 'truffleruby'
-    begin
-      require 'cgi/escape'
-    rescue LoadError
-    end
-  end
-
-  # Unescape a string that has been HTML-escaped
-  #   CGI.unescapeHTML("Usage: foo &quot;bar&quot; &lt;baz&gt;")
-  #      # => "Usage: foo \"bar\" <baz>"
-  def unescapeHTML(string)
-    enc = string.encoding
-    unless enc.ascii_compatible?
-      if enc.dummy?
-        origenc = enc
-        enc = Encoding::Converter.asciicompat_encoding(enc)
-        string = enc ? string.encode(enc) : string.b
-      end
-      string = string.gsub(Regexp.new('&(apos|amp|quot|gt|lt|#[0-9]+|#x[0-9A-Fa-f]+);'.encode(enc))) do
-        case $1.encode(Encoding::US_ASCII)
-        when 'apos'                then "'".encode(enc)
-        when 'amp'                 then '&'.encode(enc)
-        when 'quot'                then '"'.encode(enc)
-        when 'gt'                  then '>'.encode(enc)
-        when 'lt'                  then '<'.encode(enc)
-        when /\A#0*(\d+)\z/        then $1.to_i.chr(enc)
-        when /\A#x([0-9a-f]+)\z/i  then $1.hex.chr(enc)
-        end
-      end
-      string.encode!(origenc) if origenc
-      return string
-    end
-    return string unless string.include? '&'
-    charlimit = case enc
-                when Encoding::UTF_8; 0x10ffff
-                when Encoding::ISO_8859_1; 256
-                else 128
-                end
-    string = string.b
-    string.gsub!(/&(apos|amp|quot|gt|lt|\#[0-9]+|\#[xX][0-9A-Fa-f]+);/) do
-      match = $1.dup
-      case match
-      when 'apos'                then "'"
-      when 'amp'                 then '&'
-      when 'quot'                then '"'
-      when 'gt'                  then '>'
-      when 'lt'                  then '<'
-      when /\A#0*(\d+)\z/
-        n = $1.to_i
-        if n < charlimit
-          n.chr(enc)
-        else
-          "&##{$1};"
-        end
-      when /\A#x([0-9a-f]+)\z/i
-        n = $1.hex
-        if n < charlimit
-          n.chr(enc)
-        else
-          "&#x#{$1};"
-        end
-      else
-        "&#{match};"
-      end
-    end
-    string.force_encoding enc
-  end
-
-  # Synonym for CGI.escapeHTML(str)
-  alias escape_html escapeHTML
-
-  # Synonym for CGI.unescapeHTML(str)
-  alias unescape_html unescapeHTML
-
-  # Escape only the tags of certain HTML elements in +string+.
-  #
-  # Takes an element or elements or array of elements.  Each element
-  # is specified by the name of the element, without angle brackets.
-  # This matches both the start and the end tag of that element.
-  # The attribute list of the open tag will also be escaped (for
-  # instance, the double-quotes surrounding attribute values).
-  #
-  #   print CGI.escapeElement('<BR><A HREF="url"></A>', "A", "IMG")
-  #     # "<BR>&lt;A HREF=&quot;url&quot;&gt;&lt;/A&gt"
-  #
-  #   print CGI.escapeElement('<BR><A HREF="url"></A>', ["A", "IMG"])
-  #     # "<BR>&lt;A HREF=&quot;url&quot;&gt;&lt;/A&gt"
-  def escapeElement(string, *elements)
-    elements = elements[0] if elements[0].kind_of?(Array)
-    unless elements.empty?
-      string.gsub(/<\/?(?:#{elements.join("|")})\b[^<>]*+>?/im) do
-        CGI.escapeHTML($&)
-      end
-    else
-      string
-    end
-  end
-
-  # Undo escaping such as that done by CGI.escapeElement()
-  #
-  #   print CGI.unescapeElement(
-  #           CGI.escapeHTML('<BR><A HREF="url"></A>'), "A", "IMG")
-  #     # "&lt;BR&gt;<A HREF="url"></A>"
-  #
-  #   print CGI.unescapeElement(
-  #           CGI.escapeHTML('<BR><A HREF="url"></A>'), ["A", "IMG"])
-  #     # "&lt;BR&gt;<A HREF="url"></A>"
-  def unescapeElement(string, *elements)
-    elements = elements[0] if elements[0].kind_of?(Array)
-    unless elements.empty?
-      string.gsub(/&lt;\/?(?:#{elements.join("|")})\b(?>[^&]+|&(?![gl]t;)\w+;)*(?:&gt;)?/im) do
-        unescapeHTML($&)
-      end
-    else
-      string
-    end
-  end
-
-  # Synonym for CGI.escapeElement(str)
-  alias escape_element escapeElement
-
-  # Synonym for CGI.unescapeElement(str)
-  alias unescape_element unescapeElement
-
  # Format a +Time+ object as a String using the format specified by RFC 1123.
  #
  #   CGI.rfc1123_date(Time.now)
@@ -253,6 +41,4 @@ module CGI::Util
    end
    lines.gsub(/^((?:#{Regexp::quote(shift)})*)__(?=<\/?\w)/, '\1')
  end
-
-  alias h escapeHTML
 end
--- a/test/cgi/test_cgi_util.rb
+++ b/test/cgi/test_cgi_util.rb
@@ -6,7 +6,7 @@ require_relative 'update_env'


 class CGIUtilTest < Test::Unit::TestCase
-  include CGI::Util
+  include CGI::Escape
  include UpdateEnv

  def setup
@@ -63,7 +63,7 @@ class CGIUtilTest < Test::Unit::TestCase
    return unless defined?(::Encoding)

    assert_raise(TypeError) {CGI.unescape('', nil)}
-    assert_separately(%w[-rcgi/util], "#{<<-"begin;"}\n#{<<-"end;"}")
+    assert_separately(%w[-rcgi/escape], "#{<<-"begin;"}\n#{<<-"end;"}")
    begin;
      assert_equal("", CGI.unescape(''))
    end;
@@ -120,7 +120,7 @@ class CGIUtilTest < Test::Unit::TestCase
    return unless defined?(::Encoding)

    assert_raise(TypeError) {CGI.unescapeURIComponent('', nil)}
-    assert_separately(%w[-rcgi/util], "#{<<-"begin;"}\n#{<<-"end;"}")
+    assert_separately(%w[-rcgi/escape], "#{<<-"begin;"}\n#{<<-"end;"}")
    begin;
      assert_equal("", CGI.unescapeURIComponent(''))
    end;
@@ -300,21 +300,21 @@ end

 class CGIUtilPureRubyTest < Test::Unit::TestCase
  def setup
-    CGI::Escape.module_eval do
+    CGI::EscapeExt.module_eval do
      alias _escapeHTML escapeHTML
      remove_method :escapeHTML
      alias _unescapeHTML unescapeHTML
      remove_method :unescapeHTML
-    end if defined?(CGI::Escape)
+    end if defined?(CGI::EscapeExt)
  end

  def teardown
-    CGI::Escape.module_eval do
+    CGI::EscapeExt.module_eval do
      alias escapeHTML _escapeHTML
      remove_method :_escapeHTML
      alias unescapeHTML _unescapeHTML
      remove_method :_unescapeHTML
-    end if defined?(CGI::Escape)
+    end if defined?(CGI::EscapeExt)
  end

  include CGIUtilTest::UnescapeHTMLTests