[ruby/rdoc] Refactor RDoc::Markup::Parser#tokenize

Make verbatims text or newline only, and simplify `build_verbatim`.

https://github.com/ruby/rdoc/commit/41ceae93b3
This commit is contained in:
Nobuyoshi Nakada 2022-10-05 20:07:02 +09:00 committed by GitHub
parent 29862ce273
commit 21977b95e2
2 changed files with 116 additions and 115 deletions

View File

@ -272,44 +272,11 @@ class RDoc::Markup::Parser
end end
case type case type
when :HEADER then
line << '=' * data
_, _, peek_column, = peek_token
peek_column ||= column + data
indent = peek_column - column - data
line << ' ' * indent
when :RULE then
width = 2 + data
line << '-' * width
_, _, peek_column, = peek_token
peek_column ||= column + width
indent = peek_column - column - width
line << ' ' * indent
when :BREAK, :TEXT then when :BREAK, :TEXT then
line << data line << data
when :BLOCKQUOTE then else
line << '>>>' raise TypeError, "unexpected token under verbatim: #{type}"
peek_type, _, peek_column = peek_token
if peek_type != :NEWLINE and peek_column
line << ' ' * (peek_column - column - 3)
end
else # *LIST_TOKENS
list_marker = case type
when :BULLET then data
when :LABEL then "[#{data}]"
when :NOTE then "#{data}::"
else # :LALPHA, :NUMBER, :UALPHA
"#{data}."
end
line << list_marker
peek_type, _, peek_column = peek_token
unless peek_type == :NEWLINE then
peek_column ||= column + list_marker.length
indent = peek_column - column - list_marker.length
line << ' ' * indent
end
end end
end end
verbatim << line << "\n" unless line.empty? verbatim << line << "\n" unless line.empty?
@ -481,11 +448,37 @@ class RDoc::Markup::Parser
## ##
# Turns text +input+ into a stream of tokens # Turns text +input+ into a stream of tokens
def tokenize input def tokenize(input)
setup_scanner input setup_scanner input
margin = @s.pos[0]
tokenize_indented(margin)
tokenize_input(margin)
end
until @s.eos? do def newline!(pos = nil)
if pos or (@s.scan(/ *(?=\r?\n)/) and pos = @s.pos and @s.scan(/\r?\n/))
@tokens << [:NEWLINE, @s.matched, *pos]
@s.newline!
end
end
def tokenize_indented(column)
indent = / {#{column+1},}(?=\S)| *(?=\r?\n)/
while @s.scan(indent)
pos = @s.pos pos = @s.pos
if @s.scan(/(.+)(?=\r?\n)?/)
@tokens << [:TEXT, @s.matched, *pos]
end
newline! or break
end
end
def tokenize_input(margin)
column = 0
until @s.eos?
pos = @s.pos
break if pos[0] < (margin ||= pos[0])
# leading spaces will be reflected by the column of the next token # leading spaces will be reflected by the column of the next token
# the only thing we lose are trailing spaces at the end of the file # the only thing we lose are trailing spaces at the end of the file
@ -494,75 +487,84 @@ class RDoc::Markup::Parser
# note: after BULLET, LABEL, etc., # note: after BULLET, LABEL, etc.,
# indent will be the column of the next non-newline token # indent will be the column of the next non-newline token
@tokens << case case
# [CR]LF => :NEWLINE # [CR]LF => :NEWLINE
when @s.scan(/\r?\n/) then when @s.scan(/\r?\n/)
token = [:NEWLINE, @s.matched, *pos] newline!(pos)
@s.newline! next
token
# === text => :HEADER then :TEXT
when @s.scan(/(=+)(\s*)/) then
level = @s[1].length
header = [:HEADER, level, *pos]
if @s[2] =~ /^\r?\n/ then # === text => :HEADER then :TEXT
@s.unscan(@s[2]) when @s.scan(/(=+)(\s*)/)
header level = @s[1].length
else header = [:HEADER, level, *pos]
pos = @s.pos
@s.scan(/.*/)
@tokens << header
[:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end
# --- (at least 3) and nothing else on the line => :RULE
when @s.scan(/(-{3,}) *\r?$/) then
[:RULE, @s[1].length - 2, *pos]
# * or - followed by white space and text => :BULLET
when @s.scan(/([*-]) +(\S)/) then
@s.unscan(@s[2])
[:BULLET, @s[1], *pos]
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
# FIXME if tab(s), the column will be wrong
# either support tabs everywhere by first expanding them to
# spaces, or assume that they will have been replaced
# before (and provide a check for that at least in debug
# mode)
list_label = @s[1]
@s.unscan(@s[2])
list_type =
case list_label
when /[a-z]/ then :LALPHA
when /[A-Z]/ then :UALPHA
when /\d/ then :NUMBER
else
raise ParseError, "BUG token #{list_label}"
end
[list_type, list_label, *pos]
# [text] followed by spaces or end of line => :LABEL
when @s.scan(/\[(.*?)\]( +|\r?$)/) then
[:LABEL, @s[1], *pos]
# text:: followed by spaces or end of line => :NOTE
when @s.scan(/(.*?)::( +|\r?$)/) then
[:NOTE, @s[1], *pos]
# >>> followed by end of line => :BLOCKQUOTE
when @s.scan(/>>> *(\w+)?$/) then
if word = @s[1]
@s.unscan(word)
end
[:BLOCKQUOTE, word, *pos]
# anything else: :TEXT
else
@s.scan(/(.*?)( )?\r?$/)
token = [:TEXT, @s[1], *pos]
if @s[2] then if @s[2] =~ /^\r?\n/
@tokens << token @s.unscan(@s[2])
[:BREAK, @s[2], pos[0] + @s[1].length, pos[1]] @tokens << header
else else
token pos = @s.pos
end @s.scan(/.*/)
end @tokens << header
@tokens << [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end
# --- (at least 3) and nothing else on the line => :RULE
when @s.scan(/(-{3,}) *\r?$/)
@tokens << [:RULE, @s[1].length - 2, *pos]
# * or - followed by white space and text => :BULLET
when @s.scan(/([*-]) +(?=\S)/)
@tokens << [:BULLET, @s[1], *pos]
tokenize_input(nil)
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
when @s.scan(/([a-z]|\d+)\. +(?=\S)/i)
# FIXME if tab(s), the column will be wrong
# either support tabs everywhere by first expanding them to
# spaces, or assume that they will have been replaced
# before (and provide a check for that at least in debug
# mode)
list_label = @s[1]
list_type =
case list_label
when /[a-z]/ then :LALPHA
when /[A-Z]/ then :UALPHA
when /\d/ then :NUMBER
else
raise ParseError, "BUG token #{list_label}"
end
@tokens << [list_type, list_label, *pos]
tokenize_input(nil)
# [text] followed by spaces or end of line => :LABEL
when @s.scan(/\[(.*?)\]( +|\r?$)/)
@tokens << [:LABEL, @s[1], *pos]
tokenize_input(nil)
# text:: followed by spaces or end of line => :NOTE
when @s.scan(/(.*?)::( +|\r?$)/)
@tokens << [:NOTE, @s[1], *pos]
tokenize_input(nil)
# >>> followed by end of line => :BLOCKQUOTE
when @s.scan(/>>> *(\w+)?\r?$/)
@tokens << [:BLOCKQUOTE, @s[1], *pos]
newline!
tokenize_input(nil)
# anything else: :TEXT
else
column = pos[0]
@s.scan(/(.*?)( )?\r?$/)
@tokens << [:TEXT, @s[1], *pos]
if @s[2]
@tokens << [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
end
if newline!
tokenize_indented(column)
end
end
end end
self self

View File

@ -1591,8 +1591,7 @@ Example heading:
[:TEXT, 'Example heading:', 0, 0], [:TEXT, 'Example heading:', 0, 0],
[:NEWLINE, "\n", 16, 0], [:NEWLINE, "\n", 16, 0],
[:NEWLINE, "\n", 0, 1], [:NEWLINE, "\n", 0, 1],
[:HEADER, 3, 3, 2], [:TEXT, '=== heading three', 3, 2],
[:TEXT, 'heading three', 7, 2],
[:NEWLINE, "\n", 20, 2], [:NEWLINE, "\n", 20, 2],
] ]
@ -1608,7 +1607,7 @@ Example heading:
expected = [ expected = [
[:TEXT, 'Verbatim section here that is double-underlined', 2, 0], [:TEXT, 'Verbatim section here that is double-underlined', 2, 0],
[:NEWLINE, "\n", 49, 0], [:NEWLINE, "\n", 49, 0],
[:HEADER, 47, 2, 1], [:TEXT, '='*47, 2, 1],
[:NEWLINE, "\n", 49, 1], [:NEWLINE, "\n", 49, 1],
] ]
@ -1624,14 +1623,14 @@ Example heading:
STR STR
expected = [ expected = [
[:TEXT, 'A', 2, 0], [:TEXT, 'A', 2, 0],
[:NEWLINE, "\n", 3, 0], [:NEWLINE, "\n", 3, 0],
[:TEXT, 'b', 4, 1], [:TEXT, 'b', 4, 1],
[:NEWLINE, "\n", 5, 1], [:NEWLINE, "\n", 5, 1],
[:HEADER, 47, 2, 2], [:TEXT, '='*47, 2, 2],
[:NEWLINE, "\n", 49, 2], [:NEWLINE, "\n", 49, 2],
[:TEXT, 'c', 4, 3], [:TEXT, 'c', 4, 3],
[:NEWLINE, "\n", 5, 3], [:NEWLINE, "\n", 5, 3],
] ]
assert_equal expected, @RMP.tokenize(str) assert_equal expected, @RMP.tokenize(str)