* tool/enc-unicode.rb: optimized.
* enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: U+100000-U+10FFFD is assigned, not Cn. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25271 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
f0cdcf6f4e
commit
5a4ce608e2
@ -1,3 +1,11 @@
|
|||||||
|
Fri Oct 9 02:58:18 2009 NARUSE, Yui <naruse@ruby-lang.org>
|
||||||
|
|
||||||
|
* tool/enc-unicode.rb: optimized.
|
||||||
|
|
||||||
|
* enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
|
||||||
|
enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
|
||||||
|
U+100000-U+10FFFD is assigned, not Cn.
|
||||||
|
|
||||||
Fri Oct 9 02:12:02 2009 Marc-Andre Lafortune <ruby-core@marc-andre.ca>
|
Fri Oct 9 02:12:02 2009 Marc-Andre Lafortune <ruby-core@marc-andre.ca>
|
||||||
|
|
||||||
* ext/curses/curses.c: Many functions of module Curses could cause a
|
* ext/curses/curses.c: Many functions of module Curses could cause a
|
||||||
|
@ -3959,7 +3959,7 @@ static const OnigCodePoint CR_Any[] = {
|
|||||||
|
|
||||||
/* 'Assigned': - */
|
/* 'Assigned': - */
|
||||||
static const OnigCodePoint CR_Assigned[] = {
|
static const OnigCodePoint CR_Assigned[] = {
|
||||||
484,
|
485,
|
||||||
0x0000, 0x0377,
|
0x0000, 0x0377,
|
||||||
0x037a, 0x037e,
|
0x037a, 0x037e,
|
||||||
0x0384, 0x038a,
|
0x0384, 0x038a,
|
||||||
@ -4444,6 +4444,7 @@ static const OnigCodePoint CR_Assigned[] = {
|
|||||||
0xe0020, 0xe007f,
|
0xe0020, 0xe007f,
|
||||||
0xe0100, 0xe01ef,
|
0xe0100, 0xe01ef,
|
||||||
0xf0000, 0xffffd,
|
0xf0000, 0xffffd,
|
||||||
|
0x100000, 0x10fffd,
|
||||||
}; /* CR_Assigned */
|
}; /* CR_Assigned */
|
||||||
|
|
||||||
/* 'C': Major Category */
|
/* 'C': Major Category */
|
||||||
@ -4500,7 +4501,7 @@ static const OnigCodePoint CR_Cf[] = {
|
|||||||
|
|
||||||
/* 'Cn': General Category */
|
/* 'Cn': General Category */
|
||||||
static const OnigCodePoint CR_Cn[] = {
|
static const OnigCodePoint CR_Cn[] = {
|
||||||
484,
|
485,
|
||||||
0x0378, 0x0379,
|
0x0378, 0x0379,
|
||||||
0x037f, 0x0383,
|
0x037f, 0x0383,
|
||||||
0x038b, 0x038b,
|
0x038b, 0x038b,
|
||||||
@ -4984,7 +4985,8 @@ static const OnigCodePoint CR_Cn[] = {
|
|||||||
0xe0002, 0xe001f,
|
0xe0002, 0xe001f,
|
||||||
0xe0080, 0xe00ff,
|
0xe0080, 0xe00ff,
|
||||||
0xe01f0, 0xeffff,
|
0xe01f0, 0xeffff,
|
||||||
0xffffe, 0x10ffff,
|
0xffffe, 0xfffff,
|
||||||
|
0x10fffe, 0x10ffff,
|
||||||
}; /* CR_Cn */
|
}; /* CR_Cn */
|
||||||
|
|
||||||
/* 'Co': General Category */
|
/* 'Co': General Category */
|
||||||
|
@ -3923,7 +3923,7 @@ static const OnigCodePoint CR_Any[] = {
|
|||||||
|
|
||||||
/* 'Assigned': - */
|
/* 'Assigned': - */
|
||||||
static const OnigCodePoint CR_Assigned[] = {
|
static const OnigCodePoint CR_Assigned[] = {
|
||||||
484,
|
485,
|
||||||
0x0000, 0x0377,
|
0x0000, 0x0377,
|
||||||
0x037a, 0x037e,
|
0x037a, 0x037e,
|
||||||
0x0384, 0x038a,
|
0x0384, 0x038a,
|
||||||
@ -4408,6 +4408,7 @@ static const OnigCodePoint CR_Assigned[] = {
|
|||||||
0xe0020, 0xe007f,
|
0xe0020, 0xe007f,
|
||||||
0xe0100, 0xe01ef,
|
0xe0100, 0xe01ef,
|
||||||
0xf0000, 0xffffd,
|
0xf0000, 0xffffd,
|
||||||
|
0x100000, 0x10fffd,
|
||||||
}; /* CR_Assigned */
|
}; /* CR_Assigned */
|
||||||
|
|
||||||
/* 'C': Major Category */
|
/* 'C': Major Category */
|
||||||
@ -4464,7 +4465,7 @@ static const OnigCodePoint CR_Cf[] = {
|
|||||||
|
|
||||||
/* 'Cn': General Category */
|
/* 'Cn': General Category */
|
||||||
static const OnigCodePoint CR_Cn[] = {
|
static const OnigCodePoint CR_Cn[] = {
|
||||||
484,
|
485,
|
||||||
0x0378, 0x0379,
|
0x0378, 0x0379,
|
||||||
0x037f, 0x0383,
|
0x037f, 0x0383,
|
||||||
0x038b, 0x038b,
|
0x038b, 0x038b,
|
||||||
@ -4948,7 +4949,8 @@ static const OnigCodePoint CR_Cn[] = {
|
|||||||
0xe0002, 0xe001f,
|
0xe0002, 0xe001f,
|
||||||
0xe0080, 0xe00ff,
|
0xe0080, 0xe00ff,
|
||||||
0xe01f0, 0xeffff,
|
0xe01f0, 0xeffff,
|
||||||
0xffffe, 0x10ffff,
|
0xffffe, 0xfffff,
|
||||||
|
0x10fffe, 0x10ffff,
|
||||||
}; /* CR_Cn */
|
}; /* CR_Cn */
|
||||||
|
|
||||||
/* 'Co': General Category */
|
/* 'Co': General Category */
|
||||||
|
@ -3923,7 +3923,7 @@ static const OnigCodePoint CR_Any[] = {
|
|||||||
|
|
||||||
/* 'Assigned': - */
|
/* 'Assigned': - */
|
||||||
static const OnigCodePoint CR_Assigned[] = {
|
static const OnigCodePoint CR_Assigned[] = {
|
||||||
484,
|
485,
|
||||||
0x0000, 0x0377,
|
0x0000, 0x0377,
|
||||||
0x037a, 0x037e,
|
0x037a, 0x037e,
|
||||||
0x0384, 0x038a,
|
0x0384, 0x038a,
|
||||||
@ -4408,6 +4408,7 @@ static const OnigCodePoint CR_Assigned[] = {
|
|||||||
0xe0020, 0xe007f,
|
0xe0020, 0xe007f,
|
||||||
0xe0100, 0xe01ef,
|
0xe0100, 0xe01ef,
|
||||||
0xf0000, 0xffffd,
|
0xf0000, 0xffffd,
|
||||||
|
0x100000, 0x10fffd,
|
||||||
}; /* CR_Assigned */
|
}; /* CR_Assigned */
|
||||||
|
|
||||||
/* 'C': Major Category */
|
/* 'C': Major Category */
|
||||||
@ -4464,7 +4465,7 @@ static const OnigCodePoint CR_Cf[] = {
|
|||||||
|
|
||||||
/* 'Cn': General Category */
|
/* 'Cn': General Category */
|
||||||
static const OnigCodePoint CR_Cn[] = {
|
static const OnigCodePoint CR_Cn[] = {
|
||||||
484,
|
485,
|
||||||
0x0378, 0x0379,
|
0x0378, 0x0379,
|
||||||
0x037f, 0x0383,
|
0x037f, 0x0383,
|
||||||
0x038b, 0x038b,
|
0x038b, 0x038b,
|
||||||
@ -4948,7 +4949,8 @@ static const OnigCodePoint CR_Cn[] = {
|
|||||||
0xe0002, 0xe001f,
|
0xe0002, 0xe001f,
|
||||||
0xe0080, 0xe00ff,
|
0xe0080, 0xe00ff,
|
||||||
0xe01f0, 0xeffff,
|
0xe01f0, 0xeffff,
|
||||||
0xffffe, 0x10ffff,
|
0xffffe, 0xfffff,
|
||||||
|
0x10fffe, 0x10ffff,
|
||||||
}; /* CR_Cn */
|
}; /* CR_Cn */
|
||||||
|
|
||||||
/* 'Co': General Category */
|
/* 'Co': General Category */
|
||||||
|
@ -2,6 +2,13 @@
|
|||||||
|
|
||||||
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
|
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
|
||||||
# property names and POSIX character classes
|
# property names and POSIX character classes
|
||||||
|
#
|
||||||
|
# To use this, get UnicodeData.txt and Scripts.txt from unicode.org.
|
||||||
|
# (http://unicode.org/Public/UNIDATA/)
|
||||||
|
# Then run the following command:
|
||||||
|
# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
|
||||||
|
# This produces the source file for gperf.
|
||||||
|
# After this, simply make ruby.
|
||||||
|
|
||||||
unless ARGV.size == 2
|
unless ARGV.size == 2
|
||||||
$stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
|
$stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
|
||||||
@ -17,10 +24,11 @@ def pair_codepoints(codepoints)
|
|||||||
# codepoints with property _property_. Note: It is intended that some ranges
|
# codepoints with property _property_. Note: It is intended that some ranges
|
||||||
# will begin with the value with which they end, e.g. 0x0020 -> 0x0020
|
# will begin with the value with which they end, e.g. 0x0020 -> 0x0020
|
||||||
|
|
||||||
codepoints = codepoints.uniq.sort
|
codepoints.sort!
|
||||||
last_cp = codepoints.first
|
last_cp = codepoints.first
|
||||||
pairs = [[last_cp, nil]]
|
pairs = [[last_cp, nil]]
|
||||||
codepoints[1..-1].each do |codepoint|
|
codepoints[1..-1].each do |codepoint|
|
||||||
|
next if last_cp == codepoint
|
||||||
|
|
||||||
# If the current codepoint does not follow directly on from the last
|
# If the current codepoint does not follow directly on from the last
|
||||||
# codepoint, the last codepoint represents the end of the current range,
|
# codepoint, the last codepoint represents the end of the current range,
|
||||||
@ -39,7 +47,7 @@ end
|
|||||||
|
|
||||||
def parse_unicode_data(file)
|
def parse_unicode_data(file)
|
||||||
last_cp = 0
|
last_cp = 0
|
||||||
data = {'Cn' => []}
|
data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
|
||||||
beg_cp = nil
|
beg_cp = nil
|
||||||
IO.foreach(file) do |line|
|
IO.foreach(file) do |line|
|
||||||
fields = line.split(';')
|
fields = line.split(';')
|
||||||
@ -64,6 +72,10 @@ def parse_unicode_data(file)
|
|||||||
# Cn category.
|
# Cn category.
|
||||||
data['Cn'].concat((last_cp.next...beg_cp).to_a)
|
data['Cn'].concat((last_cp.next...beg_cp).to_a)
|
||||||
|
|
||||||
|
# Assigned - Defined in unicode.c; interpreted as every character in the
|
||||||
|
# Unicode range minus the unassigned characters
|
||||||
|
data['Assigned'].concat(cps)
|
||||||
|
|
||||||
# The third field denotes the 'General' category, e.g. Lu
|
# The third field denotes the 'General' category, e.g. Lu
|
||||||
(data[fields[2]] ||= []).concat(cps)
|
(data[fields[2]] ||= []).concat(cps)
|
||||||
|
|
||||||
@ -73,16 +85,15 @@ def parse_unicode_data(file)
|
|||||||
last_cp = cp
|
last_cp = cp
|
||||||
end
|
end
|
||||||
|
|
||||||
# General Category property
|
|
||||||
gcps = %w[Any Assigned]
|
|
||||||
gcps.concat data.keys.sort
|
|
||||||
|
|
||||||
# The last Cn codepoint should be 0x10ffff. If it's not, append the missing
|
# The last Cn codepoint should be 0x10ffff. If it's not, append the missing
|
||||||
# codepoints to Cn and C
|
# codepoints to Cn and C
|
||||||
cn_remainder = (data['Cn'].last.next..0x10ffff).to_a
|
cn_remainder = (last_cp.next..0x10ffff).to_a
|
||||||
data['Cn'] += cn_remainder
|
data['Cn'] += cn_remainder
|
||||||
data['C'] += cn_remainder
|
data['C'] += cn_remainder
|
||||||
|
|
||||||
|
# Define General Category properties
|
||||||
|
gcps = data.keys.sort
|
||||||
|
|
||||||
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
|
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
|
||||||
#
|
#
|
||||||
|
|
||||||
@ -145,10 +156,6 @@ def parse_unicode_data(file)
|
|||||||
# Any - Defined in unicode.c
|
# Any - Defined in unicode.c
|
||||||
data['Any'] = (0x0000..0x10ffff).to_a
|
data['Any'] = (0x0000..0x10ffff).to_a
|
||||||
|
|
||||||
# Assigned - Defined in unicode.c; interpreted as every character in the
|
|
||||||
# Unicode range minus the unassigned characters
|
|
||||||
data['Assigned'] = data['Any'] - data['Cn']
|
|
||||||
|
|
||||||
# Returns General Category Property names and the data
|
# Returns General Category Property names and the data
|
||||||
[gcps, data]
|
[gcps, data]
|
||||||
end
|
end
|
||||||
|
Loading…
x
Reference in New Issue
Block a user