* tool/enc-unicode.rb,

enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: use UTS#18 for POSIX character class. http://rubyspec.org/issues/show/161 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25338 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2009-10-14 16:51:52 +00:00 · 2009-10-14 16:51:52 +00:00 · d5537936ab
commit d5537936ab
parent 6dd93ff60d
6 changed files with 8268 additions and 15741 deletions
--- a/8
+++ b/8
@ -1,3 +1,11 @@
 Thu Oct 15 00:47:42 2009  NARUSE, Yui  <naruse@ruby-lang.org>
 	* tool/enc-unicode.rb,
 	  enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
 	  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
 	  use UTS#18 for POSIX character class.
 	  http://rubyspec.org/issues/show/161
 Thu Oct 15 00:26:07 2009  Tanaka Akira  <akr@fsij.org>
 	* ext/socket/init.c (rsock_init_sock): validate file descriptor.
--- a/enc/unicode/name2ctype.h
+++ b/enc/unicode/name2ctype.h
--- a/enc/unicode/name2ctype.h.blt
+++ b/enc/unicode/name2ctype.h.blt
--- a/enc/unicode/name2ctype.kwd
+++ b/enc/unicode/name2ctype.kwd
--- a/enc/unicode/name2ctype.src
+++ b/enc/unicode/name2ctype.src
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@ -47,7 +47,8 @@ end
 def parse_unicode_data(file)
  last_cp = 0
-  data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
+  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
    'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
  beg_cp = nil
  IO.foreach(file) do |line|
    fields = line.split(';')
@ -92,111 +93,76 @@ def parse_unicode_data(file)
  data['C'] += cn_remainder
  # Define General Category properties
-  gcps = data.keys.sort
+  gcps = data.keys.sort - POSIX_NAMES
  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #
  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = data['L'] + data['M'] + data['Nd']
  # alpha    Letter | Mark
  data['Alpha'] = data['L'] + data['M']
  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a
  # blank    Space_Separator | 0009
  data['Blank'] = data['Zs'] + [0x0009]
  # cntrl    Control
  data['Cntrl'] = data['Cc']
  # digit    Decimal_Number
  data['Digit'] = data['Nd']
  # lower    Lowercase_Letter
  data['Lower'] = data['Ll']
  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explcitly declare this way to marry it with the above
  # definition.
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
                  data['Pi'] + data['Po'] + data['Ps']
  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #               0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]
  # upper    Uppercase_Letter
  data['Upper'] = data['Lu']
  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, a-f, A-F)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a
  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']
  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
  data['Graph'] -= data['Space'] - data['C']
  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']
  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]
  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a
  # Returns General Category Property names and the data
  [gcps, data]
 end
 def define_posix_props(data)
  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #
-def parse_scripts
+  data['Alpha'] = data['Alphabetic']
  data['Upper'] = data['Uppercase']
  data['Lower'] = data['Lowercase']
  data['Punct'] = data['Punctuation']
  data['Digit'] = data['Decimal_Number']
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a
  data['Alnum'] = data['Alpha'] + data['Digit']
  data['Space'] = data['White_Space']
  data['Blank'] = data['White_Space'] - [0x0A, 0x0B, 0x0C, 0x0D, 0x85] -
    data['Line_Separator'] - data['Paragraph_Separator']
  data['Cntrl'] = data['Cc']
  data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
  data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
    data['Surrogate'] - data['Unassigned']
  data['Print'] = data['Graph'] + data['Blank'] - data['Cntrl']
 end
 def parse_scripts(data)
  files = [
    {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
    {fn: 'Scripts.txt', title: 'Script'},
    {fn: 'PropList.txt', title: 'Binary Property'}
  ]
  current = nil
-  data = []
+  cps = []
  names = []
  files.each do |file|
    IO.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
-        make_const(current, pair_codepoints(data), file[:title])
+        data[current] = cps
        make_const(current, cps, file[:title])
        names << current
-        data = []
+        cps = []
      elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
        current = $3
-        $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
+        $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
      end
    end
  end
  names
 end
-def parse_aliases
+def parse_aliases(data)
  kv = {}
  IO.foreach(get_file('PropertyAliases.txt')) do |line|
    next unless /^(\w+)\s*; (\w+)/ =~ line
    data[$1] = data[$2]
    kv[normalize_propname($1)] = normalize_propname($2)
  end
  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
    if $1 == 'gc'
      data[$3] = data[$2]
      data[$4] = data[$2]
      kv[normalize_propname($3)] = normalize_propname($2)
      kv[normalize_propname($4)] = normalize_propname($2) if $4
    else
      data[$2] = data[$3]
      data[$4] = data[$3]
      kv[normalize_propname($2)] = normalize_propname($3)
      kv[normalize_propname($4)] = normalize_propname($3) if $4
    end
@ -204,19 +170,26 @@ def parse_aliases
  kv
 end
 $const_cache = {}
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
 # the group
-def make_const(prop, pairs, name)
+def make_const(prop, data, name)
  puts "\n/* '#{prop}': #{name} */"
-  puts "static const OnigCodePoint CR_#{prop}[] = {"
+  if origprop = $const_cache.key(data)
-  # The first element of the constant is the number of pairs of codepoints
+    puts "#define CR_#{prop} CR_#{origprop}"
-  puts "\t#{pairs.size},"
+  else
-  pairs.each do |pair|
+    $const_cache[prop] = data
-    pair.map! { |c|  c == 0 ? '0x0000' : sprintf("%0#6x", c) }
+    pairs = pair_codepoints(data)
-    puts "\t#{pair.first}, #{pair.last},"
+    puts "static const OnigCodePoint CR_#{prop}[] = {"
    # The first element of the constant is the number of pairs of codepoints
    puts "\t#{pairs.size},"
    pairs.each do |pair|
      pair.map! { |c|  c == 0 ? '0x0000' : sprintf("%0#6x", c) }
      puts "\t#{pair.first}, #{pair.last},"
    end
    puts "}; /* CR_#{prop} */"
  end
  puts "}; /* CR_#{prop} */"
 end
 def normalize_propname(name)
@ -233,9 +206,6 @@ end
 # Write Data
 puts '%{'
 props, data = parse_unicode_data(get_file('UnicodeData.txt'))
 POSIX_NAMES.each do |name|
  make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
 end
 print "\n#ifdef USE_UNICODE_PROPERTIES"
 props.each do |name|
  category =
@ -244,11 +214,16 @@ props.each do |name|
    when 2 then 'General Category'
    else        '-'
    end
-  make_const(name, pair_codepoints(data[name]), category)
+  make_const(name, data[name], category)
 end
 props.concat parse_scripts(data)
 puts '#endif /* USE_UNICODE_PROPERTIES */'
 aliases = parse_aliases(data)
 define_posix_props(data)
 POSIX_NAMES.each do |name|
  make_const(name, data[name], "[[:#{name}:]]")
 end
 props.concat parse_scripts
 puts(<<'__HEREDOC')
 #endif /* USE_UNICODE_PROPERTIES */
 static const OnigCodePoint* const CodeRanges[] = {
 __HEREDOC
@ -283,7 +258,7 @@ props.each do |name|
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
 end
-parse_aliases.each_pair do |k, v|
+aliases.each_pair do |k, v|
  next if name_to_index[k]
  next unless v = name_to_index[v]
  puts "%-40s %3d" % [k + ',', v]