* tool/enc-unicode.rb,

enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
  use UTS#18 for POSIX character class.
  http://rubyspec.org/issues/show/161

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25338 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
naruse 2009-10-14 16:51:52 +00:00
parent 6dd93ff60d
commit d5537936ab
6 changed files with 8268 additions and 15741 deletions

View File

@ -1,3 +1,11 @@
Thu Oct 15 00:47:42 2009 NARUSE, Yui <naruse@ruby-lang.org>
* tool/enc-unicode.rb,
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
use UTS#18 for POSIX character class.
http://rubyspec.org/issues/show/161
Thu Oct 15 00:26:07 2009 Tanaka Akira <akr@fsij.org>
* ext/socket/init.c (rsock_init_sock): validate file descriptor.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -47,7 +47,8 @@ end
def parse_unicode_data(file) def parse_unicode_data(file)
last_cp = 0 last_cp = 0
data = {'Any' => [], 'Assigned' => [], 'Cn' => []} data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
beg_cp = nil beg_cp = nil
IO.foreach(file) do |line| IO.foreach(file) do |line|
fields = line.split(';') fields = line.split(';')
@ -92,111 +93,76 @@ def parse_unicode_data(file)
data['C'] += cn_remainder data['C'] += cn_remainder
# Define General Category properties # Define General Category properties
gcps = data.keys.sort gcps = data.keys.sort - POSIX_NAMES
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
#
# alnum Letter | Mark | Decimal_Number
data['Alnum'] = data['L'] + data['M'] + data['Nd']
# alpha Letter | Mark
data['Alpha'] = data['L'] + data['M']
# ascii 0000 - 007F
data['ASCII'] = (0..0x007F).to_a
# blank Space_Separator | 0009
data['Blank'] = data['Zs'] + [0x0009]
# cntrl Control
data['Cntrl'] = data['Cc']
# digit Decimal_Number
data['Digit'] = data['Nd']
# lower Lowercase_Letter
data['Lower'] = data['Ll']
# punct Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
# Final_Punctuation | Initial_Punctuation | Other_Punctuation |
# Open_Punctuation
# NOTE: This definition encompasses the entire P category, and the current
# mappings agree, but we explicitly declare this way to marry it with the above
# definition.
data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
data['Pi'] + data['Po'] + data['Ps']
# space Space_Separator | Line_Separator | Paragraph_Separator |
# 0009 | 000A | 000B | 000C | 000D | 0085
data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
[0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]
# upper Uppercase_Letter
data['Upper'] = data['Lu']
# xdigit 0030 - 0039 | 0041 - 0046 | 0061 - 0066
# (0-9, a-f, A-F)
data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
(0x0061..0x0066).to_a
# word Letter | Mark | Decimal_Number | Connector_Punctuation
data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']
# graph [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
data['Graph'] -= data['Space'] - data['C']
# print [[:graph:]] | [[:space:]]
data['Print'] = data['Graph'] + data['Space']
# NEWLINE - This was defined in unicode.c
data['NEWLINE'] = [0x000a]
# Any - Defined in unicode.c
data['Any'] = (0x0000..0x10ffff).to_a
# Returns General Category Property names and the data # Returns General Category Property names and the data
[gcps, data] [gcps, data]
end end
def define_posix_props(data)
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
#
def parse_scripts data['Alpha'] = data['Alphabetic']
data['Upper'] = data['Uppercase']
data['Lower'] = data['Lowercase']
data['Punct'] = data['Punctuation']
data['Digit'] = data['Decimal_Number']
data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
(0x0061..0x0066).to_a
data['Alnum'] = data['Alpha'] + data['Digit']
data['Space'] = data['White_Space']
data['Blank'] = data['White_Space'] - [0x0A, 0x0B, 0x0C, 0x0D, 0x85] -
data['Line_Separator'] - data['Paragraph_Separator']
data['Cntrl'] = data['Cc']
data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
data['Surrogate'] - data['Unassigned']
data['Print'] = data['Graph'] + data['Blank'] - data['Cntrl']
end
def parse_scripts(data)
files = [ files = [
{fn: 'DerivedCoreProperties.txt', title: 'Derived Property'}, {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
{fn: 'Scripts.txt', title: 'Script'}, {fn: 'Scripts.txt', title: 'Script'},
{fn: 'PropList.txt', title: 'Binary Property'} {fn: 'PropList.txt', title: 'Binary Property'}
] ]
current = nil current = nil
data = [] cps = []
names = [] names = []
files.each do |file| files.each do |file|
IO.foreach(get_file(file[:fn])) do |line| IO.foreach(get_file(file[:fn])) do |line|
if /^# Total code points: / =~ line if /^# Total code points: / =~ line
make_const(current, pair_codepoints(data), file[:title]) data[current] = cps
make_const(current, cps, file[:title])
names << current names << current
data = [] cps = []
elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
current = $3 current = $3
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16)) $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
end end
end end
end end
names names
end end
def parse_aliases def parse_aliases(data)
kv = {} kv = {}
IO.foreach(get_file('PropertyAliases.txt')) do |line| IO.foreach(get_file('PropertyAliases.txt')) do |line|
next unless /^(\w+)\s*; (\w+)/ =~ line next unless /^(\w+)\s*; (\w+)/ =~ line
data[$1] = data[$2]
kv[normalize_propname($1)] = normalize_propname($2) kv[normalize_propname($1)] = normalize_propname($2)
end end
IO.foreach(get_file('PropertyValueAliases.txt')) do |line| IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
if $1 == 'gc' if $1 == 'gc'
data[$3] = data[$2]
data[$4] = data[$2]
kv[normalize_propname($3)] = normalize_propname($2) kv[normalize_propname($3)] = normalize_propname($2)
kv[normalize_propname($4)] = normalize_propname($2) if $4 kv[normalize_propname($4)] = normalize_propname($2) if $4
else else
data[$2] = data[$3]
data[$4] = data[$3]
kv[normalize_propname($2)] = normalize_propname($3) kv[normalize_propname($2)] = normalize_propname($3)
kv[normalize_propname($4)] = normalize_propname($3) if $4 kv[normalize_propname($4)] = normalize_propname($3) if $4
end end
@ -204,19 +170,26 @@ def parse_aliases
kv kv
end end
$const_cache = {}
# make_const(property, pairs, name): Prints a 'static const' structure for a # make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for # given property, group of paired codepoints, and a human-friendly name for
# the group # the group
def make_const(prop, pairs, name) def make_const(prop, data, name)
puts "\n/* '#{prop}': #{name} */" puts "\n/* '#{prop}': #{name} */"
puts "static const OnigCodePoint CR_#{prop}[] = {" if origprop = $const_cache.key(data)
# The first element of the constant is the number of pairs of codepoints puts "#define CR_#{prop} CR_#{origprop}"
puts "\t#{pairs.size}," else
pairs.each do |pair| $const_cache[prop] = data
pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) } pairs = pair_codepoints(data)
puts "\t#{pair.first}, #{pair.last}," puts "static const OnigCodePoint CR_#{prop}[] = {"
# The first element of the constant is the number of pairs of codepoints
puts "\t#{pairs.size},"
pairs.each do |pair|
pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
puts "\t#{pair.first}, #{pair.last},"
end
puts "}; /* CR_#{prop} */"
end end
puts "}; /* CR_#{prop} */"
end end
def normalize_propname(name) def normalize_propname(name)
@ -233,9 +206,6 @@ end
# Write Data # Write Data
puts '%{' puts '%{'
props, data = parse_unicode_data(get_file('UnicodeData.txt')) props, data = parse_unicode_data(get_file('UnicodeData.txt'))
POSIX_NAMES.each do |name|
make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
end
print "\n#ifdef USE_UNICODE_PROPERTIES" print "\n#ifdef USE_UNICODE_PROPERTIES"
props.each do |name| props.each do |name|
category = category =
@ -244,11 +214,16 @@ props.each do |name|
when 2 then 'General Category' when 2 then 'General Category'
else '-' else '-'
end end
make_const(name, pair_codepoints(data[name]), category) make_const(name, data[name], category)
end
props.concat parse_scripts(data)
puts '#endif /* USE_UNICODE_PROPERTIES */'
aliases = parse_aliases(data)
define_posix_props(data)
POSIX_NAMES.each do |name|
make_const(name, data[name], "[[:#{name}:]]")
end end
props.concat parse_scripts
puts(<<'__HEREDOC') puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
static const OnigCodePoint* const CodeRanges[] = { static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC __HEREDOC
@ -283,7 +258,7 @@ props.each do |name|
name_to_index[name] = i name_to_index[name] = i
puts "%-40s %3d" % [name + ',', i] puts "%-40s %3d" % [name + ',', i]
end end
parse_aliases.each_pair do |k, v| aliases.each_pair do |k, v|
next if name_to_index[k] next if name_to_index[k]
next unless v = name_to_index[v] next unless v = name_to_index[v]
puts "%-40s %3d" % [k + ',', v] puts "%-40s %3d" % [k + ',', v]