* string.c (enc_strlen): move UTF-8 optimization from str_strlen to

enc_strlen. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@45617 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2014-04-18 06:42:51 +00:00 · 2014-04-18 06:42:51 +00:00 · 0ab9abe069
commit 0ab9abe069
parent b97de74ee6
2 changed files with 66 additions and 61 deletions
--- a/5
+++ b/5
@ -1,3 +1,8 @@
+Fri Apr 18 14:21:21 2014  NARUSE, Yui  <naruse@ruby-lang.org>
+
+	* string.c (enc_strlen): move UTF-8 optimization from str_strlen to
+	  enc_strlen.
+
 Fri Apr 18 08:50:18 2014  Nobuyoshi Nakada  <nobu@ruby-lang.org>

 	* configure.in (rb_cv_getcwd_malloc): check if getcwd allocates
--- a/string.c
+++ b/string.c
@ -1075,6 +1075,41 @@ rb_str_init(int argc, VALUE *argv, VALUE str)
    return str;
 }

+#ifdef NONASCII_MASK
+#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
+
+/*
+ * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
+ * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
+ * Therefore, following pseudo code can detect UTF-8 leading byte.
+ *
+ * if (!(byte & 0x80))
+ *   byte |= 0x40;          // turn on bit6
+ * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
+ *
+ * This function calculate every bytes in the argument word `s'
+ * using the above logic concurrently. and gather every bytes result.
+ */
+static inline VALUE
+count_utf8_lead_bytes_with_word(const VALUE *s)
+{
+    VALUE d = *s;
+
+    /* Transform into bit0 represent UTF-8 leading or not. */
+    d |= ~(d>>1);
+    d >>= 6;
+    d &= NONASCII_MASK >> 7;
+
+    /* Gather every bytes. */
+    d += (d>>8);
+    d += (d>>16);
+#if SIZEOF_VALUE == 8
+    d += (d>>32);
+#endif
+    return (d&0xF);
+}
+#endif
+
 static inline long
 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
 {
@ -1084,6 +1119,31 @@ enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
    }
+#ifdef NONASCII_MASK
+    else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
+	VALUE len = 0;
+	if ((int)sizeof(VALUE) * 2 < e - p) {
+	    const VALUE *s, *t;
+	    const VALUE lowbits = sizeof(VALUE) - 1;
+	    s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
+	    t = (const VALUE*)(~lowbits & (VALUE)e);
+	    while (p < (const char *)s) {
+		if (is_utf8_lead_byte(*p)) len++;
+		p++;
+	    }
+	    while (s < t) {
+		len += count_utf8_lead_bytes_with_word(s);
+		s++;
+	    }
+	    p = (const char *)s;
+	}
+	while (p < e) {
+	    if (is_utf8_lead_byte(*p)) len++;
+	    p++;
+	}
+	return (long)len;
+    }
+#endif
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
 	if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
@ -1183,41 +1243,7 @@ rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
    return c;
 }

-#ifdef NONASCII_MASK
-#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
-
-/*
- * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
- * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
- * Therefore, following pseudo code can detect UTF-8 leading byte.
- *
- * if (!(byte & 0x80))
- *   byte |= 0x40;          // turn on bit6
- * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
- *
- * This function calculate every bytes in the argument word `s'
- * using the above logic concurrently. and gather every bytes result.
- */
-static inline VALUE
-count_utf8_lead_bytes_with_word(const VALUE *s)
-{
-    VALUE d = *s;
-
-    /* Transform into bit0 represent UTF-8 leading or not. */
-    d |= ~(d>>1);
-    d >>= 6;
-    d &= NONASCII_MASK >> 7;
-
-    /* Gather every bytes. */
-    d += (d>>8);
-    d += (d>>16);
-#if SIZEOF_VALUE == 8
-    d += (d>>32);
-#endif
-    return (d&0xF);
-}
-#endif
-
+/* enc must be compatible with str's enc */
 static long
 str_strlen(VALUE str, rb_encoding *enc)
 {
@ -1230,33 +1256,7 @@ str_strlen(VALUE str, rb_encoding *enc)
    p = RSTRING_PTR(str);
    e = RSTRING_END(str);
    cr = ENC_CODERANGE(str);
-#ifdef NONASCII_MASK
-    if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
-        enc == rb_utf8_encoding()) {

-	VALUE len = 0;
-	if ((int)sizeof(VALUE) * 2 < e - p) {
-	    const VALUE *s, *t;
-	    const VALUE lowbits = sizeof(VALUE) - 1;
-	    s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
-	    t = (const VALUE*)(~lowbits & (VALUE)e);
-	    while (p < (const char *)s) {
-		if (is_utf8_lead_byte(*p)) len++;
-		p++;
-	    }
-	    while (s < t) {
-		len += count_utf8_lead_bytes_with_word(s);
-		s++;
-	    }
-	    p = (const char *)s;
-	}
-	while (p < e) {
-	    if (is_utf8_lead_byte(*p)) len++;
-	    p++;
-	}
-	return (long)len;
-    }
-#endif
    n = rb_enc_strlen_cr(p, e, enc, &cr);
    if (cr) {
        ENC_CODERANGE_SET(str, cr);