* lib/csv.rb: Incorporating the fixes from the recent
  FasterCSV releases: 1.5.2 and 1.5.3.  [ruby-core:25038]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@27025 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
commit 1bd1128989 (parent 0471422beb)
ChangeLog
@@ -1,3 +1,8 @@
+Tue Mar 23 23:58:51 2010  James Edward Gray II  <jeg2@ruby-lang.org>
+
+        * lib/csv.rb: Incorporating the fixes from the recent
+          FasterCSV releases: 1.5.2 and 1.5.3.  [ruby-core:25038]
+
 Tue Mar 23 18:35:46 2010  Shugo Maeda  <shugo@ruby-lang.org>
 
         * object.c (rb_obj_singleton_class): new method
lib/csv.rb (138 changed lines)
@@ -826,8 +826,12 @@ class CSV
     # Returns the table as a complete CSV String. Headers will be listed first,
     # then all of the field rows.
     #
+    # This method assumes you want the Table.headers(), unless you explicitly
+    # pass <tt>:write_headers => false</tt>.
+    #
     def to_csv(options = Hash.new)
-      @table.inject([headers.to_csv(options)]) do |rows, row|
+      wh = options.fetch(:write_headers, true)
+      @table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row|
         if row.header_row?
           rows
         else
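For illustration, a minimal sketch of how the new option behaves (assuming a table parsed with headers: true; this example is not part of the patch itself):

    require "csv"

    table = CSV.parse("name,age\nalice,30\nbob,25\n", headers: true)

    table.to_csv                           # => "name,age\nalice,30\nbob,25\n"
    table.to_csv(:write_headers => false)  # => "alice,30\nbob,25\n"

Fetching the option with a default of true preserves the old behavior for existing callers; only an explicit :write_headers => false drops the header row.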
@@ -1808,13 +1812,18 @@ class CSV
     # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
     # because of \r and/or \n characters embedded in quoted fields
     #
+    in_extended_col = false
+    csv             = Array.new
+
     loop do
       # add another read to the line
-      (line += @io.gets(@row_sep)) rescue return nil
-      # copy the line so we can chop it up in parsing
-      parse = line.dup
+      unless parse = @io.gets(@row_sep)
+        return nil
+      end
+
       parse.sub!(@parsers[:line_end], "")
 
+      if csv.empty?
         #
         # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
         # CSV's <tt>[nil]</tt>
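The reason a single row can need several reads is that a quoted field may legally contain the row separator; a small illustrative example of the behavior this loop supports:

    require "csv"

    # @io.gets("\n") first returns %Q{a,"multi\n}; the quote is still open,
    # so the parser stays in the extended-column state and loops to read
    # %Q{line",c\n} before it can emit the finished row.
    CSV.parse_line(%Q{a,"multi\nline",c})  # => ["a", "multi\nline", "c"]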
@@ -1822,7 +1831,6 @@ class CSV
         if parse.empty?
           @lineno += 1
           if @skip_blanks
-            line = ""
             next
           elsif @unconverted_fields
             return add_unconverted_fields(Array.new, Array.new)
@@ -1832,42 +1840,67 @@ class CSV
           return Array.new
         end
       end
 
-      #
-      # shave leading empty fields if needed, because the main parser chokes
-      # on these
-      #
-      csv = if parse.sub!(@parsers[:leading_fields], "")
-        [nil] * ($&.length / @col_sep.length)
-      else
-        Array.new
-      end
-      #
-      # then parse the main fields with a hyper-tuned Regexp from
-      # Mastering Regular Expressions, Second Edition
-      #
-      parse.gsub!(@parsers[:csv_row]) do
-        csv << if $1.nil?  # we found an unquoted field
-          if $2.empty?     # switch empty unquoted fields to +nil+...
-            nil            # for Ruby 1.8 CSV compatibility
-          else
-            # I decided to take a strict approach to CSV parsing...
-            if $2.count(@parsers[:return_newline]).zero?  # verify correctness
-              $2
-            else
-              # or throw an Exception
-              raise MalformedCSVError, "Unquoted fields do not allow " +
-                                       "\\r or \\n (line #{lineno + 1})."
-            end
-          end
-        else              # we found a quoted field...
-          $1.gsub(@quote_char * 2, @quote_char)  # unescape contents
-        end
-        ""  # gsub!'s replacement, clear the field
-      end
-
-      # if parse is empty?(), we found all the fields on the line...
-      if parse.empty?
+      parts = parse.split(@col_sep, -1)
+      csv << nil if parts.empty?
+
+      # This loop is the hot path of csv parsing. Some things may be non-dry
+      # for a reason. Make sure to benchmark when refactoring.
+      parts.each do |part|
+        if in_extended_col
+          # If we are continuing a previous column
+          if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
+            # extended column ends
+            csv.last << part[0..-2]
+            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
+            csv.last.gsub!(@quote_char * 2, @quote_char)
+            in_extended_col = false
+          else
+            csv.last << part
+            csv.last << @col_sep
+          end
+        elsif part[0] == @quote_char
+          # If we are starting a new quoted column
+          if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
+            # start an extended column
+            csv             << part[1..-1]
+            csv.last        << @col_sep
+            in_extended_col =  true
+          else
+            # regular quoted column
+            csv << part[1..-2]
+            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
+            csv.last.gsub!(@quote_char * 2, @quote_char)
+          end
+        elsif part =~ @parsers[:quote_or_nl]
+          # Unquoted field with bad characters.
+          if part =~ @parsers[:nl_or_lf]
+            raise MalformedCSVError, "Unquoted fields do not allow " +
+                                     "\\r or \\n (line #{lineno + 1})."
+          else
+            raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
+          end
+        else
+          # Regular ole unquoted field.
+          csv << (part.empty? ? nil : part)
+        end
+      end
+
+      # Replace tacked on @col_sep with @row_sep if we are still in an extended
+      # column.
+      csv[-1][-1] = @row_sep if in_extended_col
+
+      if in_extended_col
+        # if we're at eof?(), a quoted field wasn't closed...
+        if @io.eof?
+          raise MalformedCSVError,
+                "Unclosed quoted field on line #{lineno + 1}."
+        elsif @field_size_limit and csv.last.size >= @field_size_limit
+          raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
+        end
+        # otherwise, we need to loop and pull some more data to complete the row
+      else
         @lineno += 1
 
         # save fields unconverted fields, if needed...
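The heart of the new approach, sketched as a standalone method (simplified to a single physical line, with no embedded row separators and no error checking; naive_split_parse is an illustrative name, not library API): split on the column separator, then walk the parts while tracking whether a quoted field is still open.

    def naive_split_parse(line, col_sep: ",", quote: '"')
      fields    = []
      in_quoted = false                       # inside a quoted field spanning separators?

      line.chomp.split(col_sep, -1).each do |part|
        if in_quoted
          if part.end_with?(quote) && part.count(quote).odd?
            fields.last << part[0..-2]        # closing piece: drop the final quote
            fields.last.gsub!(quote * 2, quote)
            in_quoted = false
          else
            fields.last << part << col_sep    # still open: restore the separator we split on
          end
        elsif part.start_with?(quote)
          if part.end_with?(quote) && part.count(quote).even?
            fields << part[1..-2].gsub(quote * 2, quote)  # complete quoted field
          else
            fields << part[1..-1]             # opening piece of a spanning field
            fields.last << col_sep
            in_quoted = true
          end
        else
          fields << (part.empty? ? nil : part)  # plain unquoted field
        end
      end

      fields
    end

    naive_split_parse('foo,"foo,bar,baz,foo","foo"')
    # => ["foo", "foo,bar,baz,foo", "foo"]

Avoiding the backtracking-heavy csv_row Regexp on every field is what makes this path cheaper; the stray_quote, quote_or_nl, and nl_or_lf checks in the real code recover the strictness the old bad_field pattern provided.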
@@ -1886,15 +1919,6 @@ class CSV
         # return the results
         break csv
       end
-      # if we're not empty?() but at eof?(), a quoted field wasn't closed...
-      if @io.eof?
-        raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
-      elsif parse =~ @parsers[:bad_field]
-        raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
-      elsif @field_size_limit and parse.length >= @field_size_limit
-        raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
-      end
-      # otherwise, we need to loop and pull some more data to complete the row
     end
   end
   alias_method :gets, :shift
@@ -2046,33 +2070,11 @@ class CSV
     esc_row_sep = escape_re(@row_sep)
     esc_quote   = escape_re(@quote_char)
     @parsers = {
-      # for empty leading fields
-      leading_fields: encode_re("\\A(?:", esc_col_sep, ")+"),
-      # The Primary Parser
-      csv_row:        encode_re(
-        "\\G(?:\\A|", esc_col_sep, ")",                # anchor the match
-        "(?:", esc_quote,                              # find quoted fields
-               "((?>[^", esc_quote, "]*)",             # "unrolling the loop"
-               "(?>", esc_quote * 2,                   # double for escaping
-               "[^", esc_quote, "]*)*)",
-               esc_quote,
-        "|",                                           # ... or ...
-               "([^", esc_quote, esc_col_sep, "]*))",  # unquoted fields
-        "(?=", esc_col_sep, "|\\z)"                    # ensure field is ended
-      ),
-      # a test for unescaped quotes
-      bad_field:      encode_re(
-        "\\A", esc_col_sep, "?",                   # an optional comma
-        "(?:", esc_quote,                          # a quoted field
-               "(?>[^", esc_quote, "]*)",          # "unrolling the loop"
-               "(?>", esc_quote * 2,               # double for escaping
-               "[^", esc_quote, "]*)*",
-               esc_quote,                          # the closing quote
-               "[^", esc_quote, "]",               # an extra character
-        "|",                                       # ... or ...
-               "[^", esc_quote, esc_col_sep, "]+", # an unquoted field
-               esc_quote, ")"                      # an extra quote
-      ),
+      # for detecting parse errors
+      quote_or_nl:    encode_re("[", esc_quote, "\r\n]"),
+      nl_or_lf:       encode_re("[\r\n]"),
+      stray_quote:    encode_re( "[^", esc_quote, "]", esc_quote,
+                                 "[^", esc_quote, "]" ),
       # safer than chomp!()
       line_end:       encode_re(esc_row_sep, "\\z"),
       # illegal unquoted characters
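For reference, plain-Ruby equivalents of the three new patterns with the default '"' quote character (the library builds them through encode_re so that they match the data's encoding):

    quote_or_nl = /["\r\n]/    # anything an unquoted field is not allowed to contain
    nl_or_lf    = /[\r\n]/     # specifically a line break in an unquoted field
    stray_quote = /[^"]"[^"]/  # a lone quote inside already-captured field content

    'ab"cd'  =~ stray_quote    # => 1   (illegal quoting)
    'ab""cd' =~ stray_quote    # => nil (doubled quotes are escapes and still pass)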
@@ -115,6 +115,18 @@ class TestCSVParsing < Test::Unit::TestCase
     assert_equal(Array.new, CSV.parse_line("\n1,2,3\n"))
   end
 
+  def test_non_regex_edge_cases
+    # An early version of the non-regex parser fails this test
+    [ [ "foo,\"foo,bar,baz,foo\",\"foo\"",
+        ["foo", "foo,bar,baz,foo", "foo"] ] ].each do |edge_case|
+      assert_equal(edge_case.last, CSV.parse_line(edge_case.first))
+    end
+
+    assert_raise(CSV::MalformedCSVError) do
+      CSV.parse_line("1,\"23\"4\"5\", 6")
+    end
+  end
+
   def test_malformed_csv
     assert_raise(CSV::MalformedCSVError) do
       CSV.parse_line("1,2\r,3", row_sep: "\n")
@@ -75,6 +75,11 @@ class TestCSVInterface < Test::Unit::TestCase
     assert_equal(%w{1 2 3}, row)
   end
 
+  def test_parse_line_with_empty_lines
+    assert_equal(nil,       CSV.parse_line(""))         # to signal eof
+    assert_equal(Array.new, CSV.parse_line("\n1,2,3"))
+  end
+
   def test_read_and_readlines
     assert_equal( @expected,
                   CSV.read(@path, col_sep: "\t", row_sep: "\r\n") )
@@ -167,7 +172,7 @@ class TestCSVInterface < Test::Unit::TestCase
       csv << lines.first.keys
       lines.each { |line| csv << line }
     end
-    CSV.open( @path, "w", headers:           true,
+    CSV.open( @path, "r", headers:           true,
               converters:        :all,
               header_converters: :symbol ) do |csv|
       csv.each { |line| assert_equal(lines.shift, line.to_hash) }
@@ -253,6 +253,8 @@ class TestCSVTable < Test::Unit::TestCase
     # with options
     assert_equal( csv.gsub(",", "|").gsub("\n", "\r\n"),
                   @table.to_csv(col_sep: "|", row_sep: "\r\n") )
+    assert_equal( csv.lines.to_a[1..-1].join,
+                  @table.to_csv(:write_headers => false) )
 
     # with headers
     assert_equal(csv, @header_table.to_csv)