Optimize CGI.escapeHTML by reducing buffer extension

and switch-case branches. Buffer allocation optimization using `ALLOCA_N` would be the main benefit of this patch. It eliminates the O(N) buffer extensions. It also reduces the number of branches by using an escape table, as in https://mattn.kaoriya.net/software/lang/c/20160817011915.htm. Closes: https://github.com/ruby/ruby/pull/2226 Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org> Co-authored-by: Yasuhiro MATSUMOTO <mattn.jp@gmail.com>
2019-06-04 19:58:39 +09:00 · 2019-06-04 19:58:39 +09:00 · 8d81e59aa7
commit 8d81e59aa7
parent b31e1b4a7c
2 changed files with 72 additions and 48 deletions
--- a/benchmark/cgi_escape_html.yml
+++ b/benchmark/cgi_escape_html.yml
@ -0,0 +1,40 @@
 # Benchmark cases for CGI.escapeHTML. The cases vary the input length and the
 # proportion of characters that need escaping (the five characters ' & " < >).
 # NOTE(review): `loop_count` is presumably the number of times `script` is run
 # per case, and each case's `prelude` presumably runs once before it; confirm
 # against the benchmark runner's documentation.
 #
 # Shared prelude: loads the cgi/escape extension that provides CGI.escapeHTML.
 prelude: require 'cgi/escape'
 benchmark:
  # Empty input: nothing to scan and nothing to escape.
  - name: escape_html_blank
    prelude: str = ""
    script: CGI.escapeHTML(str)
    loop_count: 20000000
  # Five characters, none of which needs escaping.
  - name: escape_html_short_none
    prelude: str = "abcde"
    script: CGI.escapeHTML(str)
    loop_count: 20000000
  # Five characters, only the last of which ('<') needs escaping.
  - name: escape_html_short_one
    prelude: str = "abcd<"
    script: CGI.escapeHTML(str)
    loop_count: 20000000
  # Five characters, every one of which needs escaping.
  - name: escape_html_short_all
    prelude: str = "'&\"<>"
    script: CGI.escapeHTML(str)
    loop_count: 5000000
  # 1500 characters ("abcde" repeated 300 times), none of which needs escaping.
  - name: escape_html_long_none
    prelude: str = "abcde" * 300
    script: CGI.escapeHTML(str)
    loop_count: 1000000
  # 50 characters (the five escapable characters repeated 10 times), all of
  # which need escaping. Note this input is much shorter than the 1500-character
  # input of escape_html_long_none.
  - name: escape_html_long_all
    prelude: str = "'&\"<>" * 10
    script: CGI.escapeHTML(str)
    loop_count: 1000000
  # Mixed input: an HTML fragment in which tags and attribute quotes need
  # escaping and the surrounding text does not.
  - name: escape_html_real
    prelude: | # http://example.com/
      str = <<~HTML
        <body>
        <div>
            <h1>Example Domain</h1>
            <p>This domain is established to be used for illustrative examples in documents. You may use this
            domain in examples without prior coordination or asking for permission.</p>
            <p><a href="http://www.iana.org/domains/example">More information...</a></p>
        </div>
        </body>
      HTML
    script: CGI.escapeHTML(str)
    loop_count: 1000000
--- a/ext/cgi/escape/escape.c
+++ b/ext/cgi/escape/escape.c
@ -11,27 +11,20 @@ RUBY_EXTERN const signed char ruby_digit36_to_number_table[];
 static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
 static ID id_accept_charset;
-static void
+#define HTML_ESCAPE_MAX_LEN 6
-html_escaped_cat(VALUE str, char c)
+
-{
+static const struct {
-    switch (c) {
+    uint8_t len;
-      case '\'':
+    char str[HTML_ESCAPE_MAX_LEN+1];
-	rb_str_cat_cstr(str, "&#39;");
+} html_escape_table[UCHAR_MAX+1] = {
-	break;
+#define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str}
-      case '&':
+    HTML_ESCAPE('\'', "&#39;"),
-	rb_str_cat_cstr(str, "&amp;");
+    HTML_ESCAPE('&', "&amp;"),
-	break;
+    HTML_ESCAPE('"', "&quot;"),
-      case '"':
+    HTML_ESCAPE('<', "&lt;"),
-	rb_str_cat_cstr(str, "&quot;");
+    HTML_ESCAPE('>', "&gt;"),
-	break;
+#undef HTML_ESCAPE
-      case '<':
+};
 	rb_str_cat_cstr(str, "&lt;");
 	break;
      case '>':
 	rb_str_cat_cstr(str, "&gt;");
 	break;
    }
 }
 static inline void
 preserve_original_state(VALUE orig, VALUE dest)
@ -44,36 +37,27 @@ preserve_original_state(VALUE orig, VALUE dest)
 static VALUE
 optimized_escape_html(VALUE str)
 {
-    long i, len, beg = 0;
+    const char *cstr = RSTRING_PTR(str);
-    VALUE dest = 0;
+    const char *end = cstr + RSTRING_LEN(str);
-    const char *cstr;
+    char *buf = ALLOCA_N(char, RSTRING_LEN(str) * HTML_ESCAPE_MAX_LEN);
-    len  = RSTRING_LEN(str);
+    char *dest = buf;
-    cstr = RSTRING_PTR(str);
+    while (cstr < end) {
-
+        const unsigned char c = *cstr++;
-    for (i = 0; i < len; i++) {
+        uint8_t len = html_escape_table[c].len;
-	switch (cstr[i]) {
+        if (len) {
-	  case '\'':
+            memcpy(dest, html_escape_table[c].str, len);
-	  case '&':
+            dest += len;
-	  case '"':
+        }
-	  case '<':
+        else {
-	  case '>':
+            *dest++ = c;
-	    if (!dest) {
+        }
 		dest = rb_str_buf_new(len);
 	    }
 	    rb_str_cat(dest, cstr + beg, i - beg);
 	    beg = i + 1;
 	    html_escaped_cat(dest, cstr[i]);
 	    break;
 	}
    }
-    if (dest) {
+    if (RSTRING_LEN(str) < (dest - buf)) {
-	rb_str_cat(dest, cstr + beg, len - beg);
+        VALUE escaped = rb_str_new(buf, dest - buf);
-	preserve_original_state(str, dest);
+        preserve_original_state(str, escaped);
-	return dest;
+        return escaped;
    }
    else {
 	return rb_str_dup(str);