* tool/enc-unicode.rb,
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: use UTS#18 for POSIX character class. http://rubyspec.org/issues/show/161 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25338 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
6dd93ff60d
commit
d5537936ab
@ -1,3 +1,11 @@
|
||||
Thu Oct 15 00:47:42 2009 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* tool/enc-unicode.rb,
|
||||
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
|
||||
enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
|
||||
use UTS#18 for POSIX character class.
|
||||
http://rubyspec.org/issues/show/161
|
||||
|
||||
Thu Oct 15 00:26:07 2009 Tanaka Akira <akr@fsij.org>
|
||||
|
||||
* ext/socket/init.c (rsock_init_sock): validate file descriptor.
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -47,7 +47,8 @@ end
|
||||
|
||||
def parse_unicode_data(file)
|
||||
last_cp = 0
|
||||
data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
|
||||
data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
|
||||
'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
|
||||
beg_cp = nil
|
||||
IO.foreach(file) do |line|
|
||||
fields = line.split(';')
|
||||
@ -92,111 +93,76 @@ def parse_unicode_data(file)
|
||||
data['C'] += cn_remainder
|
||||
|
||||
# Define General Category properties
|
||||
gcps = data.keys.sort
|
||||
|
||||
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
|
||||
#
|
||||
|
||||
# alnum Letter | Mark | Decimal_Number
|
||||
data['Alnum'] = data['L'] + data['M'] + data['Nd']
|
||||
|
||||
# alpha Letter | Mark
|
||||
data['Alpha'] = data['L'] + data['M']
|
||||
|
||||
# ascii 0000 - 007F
|
||||
data['ASCII'] = (0..0x007F).to_a
|
||||
|
||||
# blank Space_Separator | 0009
|
||||
data['Blank'] = data['Zs'] + [0x0009]
|
||||
|
||||
# cntrl Control
|
||||
data['Cntrl'] = data['Cc']
|
||||
|
||||
# digit Decimal_Number
|
||||
data['Digit'] = data['Nd']
|
||||
|
||||
# lower Lowercase_Letter
|
||||
data['Lower'] = data['Ll']
|
||||
|
||||
# punct Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
|
||||
# Final_Punctuation | Initial_Punctuation | Other_Punctuation |
|
||||
# Open_Punctuation
|
||||
# NOTE: This definition encompasses the entire P category, and the current
|
||||
# mappings agree, but we explcitly declare this way to marry it with the above
|
||||
# definition.
|
||||
data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
|
||||
data['Pi'] + data['Po'] + data['Ps']
|
||||
|
||||
# space Space_Separator | Line_Separator | Paragraph_Separator |
|
||||
# 0009 | 000A | 000B | 000C | 000D | 0085
|
||||
data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
|
||||
[0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]
|
||||
|
||||
# upper Uppercase_Letter
|
||||
data['Upper'] = data['Lu']
|
||||
|
||||
# xdigit 0030 - 0039 | 0041 - 0046 | 0061 - 0066
|
||||
# (0-9, a-f, A-F)
|
||||
data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
|
||||
(0x0061..0x0066).to_a
|
||||
|
||||
# word Letter | Mark | Decimal_Number | Connector_Punctuation
|
||||
data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']
|
||||
|
||||
# graph [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
|
||||
data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
|
||||
data['Graph'] -= data['Space'] - data['C']
|
||||
|
||||
# print [[:graph:]] | [[:space:]]
|
||||
data['Print'] = data['Graph'] + data['Space']
|
||||
|
||||
# NEWLINE - This was defined in unicode.c
|
||||
data['NEWLINE'] = [0x000a]
|
||||
|
||||
# Any - Defined in unicode.c
|
||||
data['Any'] = (0x0000..0x10ffff).to_a
|
||||
gcps = data.keys.sort - POSIX_NAMES
|
||||
|
||||
# Returns General Category Property names and the data
|
||||
[gcps, data]
|
||||
end
|
||||
|
||||
def define_posix_props(data)
|
||||
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
|
||||
#
|
||||
|
||||
def parse_scripts
|
||||
data['Alpha'] = data['Alphabetic']
|
||||
data['Upper'] = data['Uppercase']
|
||||
data['Lower'] = data['Lowercase']
|
||||
data['Punct'] = data['Punctuation']
|
||||
data['Digit'] = data['Decimal_Number']
|
||||
data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
|
||||
(0x0061..0x0066).to_a
|
||||
data['Alnum'] = data['Alpha'] + data['Digit']
|
||||
data['Space'] = data['White_Space']
|
||||
data['Blank'] = data['White_Space'] - [0x0A, 0x0B, 0x0C, 0x0D, 0x85] -
|
||||
data['Line_Separator'] - data['Paragraph_Separator']
|
||||
data['Cntrl'] = data['Cc']
|
||||
data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
|
||||
data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
|
||||
data['Surrogate'] - data['Unassigned']
|
||||
data['Print'] = data['Graph'] + data['Blank'] - data['Cntrl']
|
||||
end
|
||||
|
||||
def parse_scripts(data)
|
||||
files = [
|
||||
{fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
|
||||
{fn: 'Scripts.txt', title: 'Script'},
|
||||
{fn: 'PropList.txt', title: 'Binary Property'}
|
||||
]
|
||||
current = nil
|
||||
data = []
|
||||
cps = []
|
||||
names = []
|
||||
files.each do |file|
|
||||
IO.foreach(get_file(file[:fn])) do |line|
|
||||
if /^# Total code points: / =~ line
|
||||
make_const(current, pair_codepoints(data), file[:title])
|
||||
data[current] = cps
|
||||
make_const(current, cps, file[:title])
|
||||
names << current
|
||||
data = []
|
||||
cps = []
|
||||
elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
|
||||
current = $3
|
||||
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
|
||||
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
|
||||
end
|
||||
end
|
||||
end
|
||||
names
|
||||
end
|
||||
|
||||
def parse_aliases
|
||||
def parse_aliases(data)
|
||||
kv = {}
|
||||
IO.foreach(get_file('PropertyAliases.txt')) do |line|
|
||||
next unless /^(\w+)\s*; (\w+)/ =~ line
|
||||
data[$1] = data[$2]
|
||||
kv[normalize_propname($1)] = normalize_propname($2)
|
||||
end
|
||||
IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
|
||||
next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
|
||||
if $1 == 'gc'
|
||||
data[$3] = data[$2]
|
||||
data[$4] = data[$2]
|
||||
kv[normalize_propname($3)] = normalize_propname($2)
|
||||
kv[normalize_propname($4)] = normalize_propname($2) if $4
|
||||
else
|
||||
data[$2] = data[$3]
|
||||
data[$4] = data[$3]
|
||||
kv[normalize_propname($2)] = normalize_propname($3)
|
||||
kv[normalize_propname($4)] = normalize_propname($3) if $4
|
||||
end
|
||||
@ -204,19 +170,26 @@ def parse_aliases
|
||||
kv
|
||||
end
|
||||
|
||||
$const_cache = {}
|
||||
# make_const(property, pairs, name): Prints a 'static const' structure for a
|
||||
# given property, group of paired codepoints, and a human-friendly name for
|
||||
# the group
|
||||
def make_const(prop, pairs, name)
|
||||
def make_const(prop, data, name)
|
||||
puts "\n/* '#{prop}': #{name} */"
|
||||
puts "static const OnigCodePoint CR_#{prop}[] = {"
|
||||
# The first element of the constant is the number of pairs of codepoints
|
||||
puts "\t#{pairs.size},"
|
||||
pairs.each do |pair|
|
||||
pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
|
||||
puts "\t#{pair.first}, #{pair.last},"
|
||||
if origprop = $const_cache.key(data)
|
||||
puts "#define CR_#{prop} CR_#{origprop}"
|
||||
else
|
||||
$const_cache[prop] = data
|
||||
pairs = pair_codepoints(data)
|
||||
puts "static const OnigCodePoint CR_#{prop}[] = {"
|
||||
# The first element of the constant is the number of pairs of codepoints
|
||||
puts "\t#{pairs.size},"
|
||||
pairs.each do |pair|
|
||||
pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
|
||||
puts "\t#{pair.first}, #{pair.last},"
|
||||
end
|
||||
puts "}; /* CR_#{prop} */"
|
||||
end
|
||||
puts "}; /* CR_#{prop} */"
|
||||
end
|
||||
|
||||
def normalize_propname(name)
|
||||
@ -233,9 +206,6 @@ end
|
||||
# Write Data
|
||||
puts '%{'
|
||||
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
|
||||
POSIX_NAMES.each do |name|
|
||||
make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
|
||||
end
|
||||
print "\n#ifdef USE_UNICODE_PROPERTIES"
|
||||
props.each do |name|
|
||||
category =
|
||||
@ -244,11 +214,16 @@ props.each do |name|
|
||||
when 2 then 'General Category'
|
||||
else '-'
|
||||
end
|
||||
make_const(name, pair_codepoints(data[name]), category)
|
||||
make_const(name, data[name], category)
|
||||
end
|
||||
props.concat parse_scripts(data)
|
||||
puts '#endif /* USE_UNICODE_PROPERTIES */'
|
||||
aliases = parse_aliases(data)
|
||||
define_posix_props(data)
|
||||
POSIX_NAMES.each do |name|
|
||||
make_const(name, data[name], "[[:#{name}:]]")
|
||||
end
|
||||
props.concat parse_scripts
|
||||
puts(<<'__HEREDOC')
|
||||
#endif /* USE_UNICODE_PROPERTIES */
|
||||
|
||||
static const OnigCodePoint* const CodeRanges[] = {
|
||||
__HEREDOC
|
||||
@ -283,7 +258,7 @@ props.each do |name|
|
||||
name_to_index[name] = i
|
||||
puts "%-40s %3d" % [name + ',', i]
|
||||
end
|
||||
parse_aliases.each_pair do |k, v|
|
||||
aliases.each_pair do |k, v|
|
||||
next if name_to_index[k]
|
||||
next unless v = name_to_index[v]
|
||||
puts "%-40s %3d" % [k + ',', v]
|
||||
|
Loading…
x
Reference in New Issue
Block a user