1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
#!/usr/bin/env ruby
begin
$:.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
require "active_support"
rescue IOError
end
require "open-uri"
require "tmpdir"
require "fileutils"
module ActiveSupport
module Multibyte
module Unicode
class UnicodeDatabase
def load; end
end
class DatabaseGenerator
BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
SOURCES = {
codepoints: BASE_URI + "UnicodeData.txt",
composition_exclusion: BASE_URI + "CompositionExclusions.txt",
grapheme_break_property: BASE_URI + "auxiliary/GraphemeBreakProperty.txt",
cp1252: "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT"
}
def initialize
@ucd = Unicode::UnicodeDatabase.new
end
def parse_codepoints(line)
codepoint = Codepoint.new
raise "Could not parse input." unless line =~ /^
([0-9A-F]+); # code
([^;]+); # name
([A-Z]+); # general category
([0-9]+); # canonical combining class
([A-Z]+); # bidi class
(<([A-Z]*)>)? # decomposition type
((\ ?[0-9A-F]+)*); # decomposition mapping
([0-9]*); # decimal digit
([0-9]*); # digit
([^;]*); # numeric
([YN]*); # bidi mirrored
([^;]*); # unicode 1.0 name
([^;]*); # iso comment
([0-9A-F]*); # simple uppercase mapping
([0-9A-F]*); # simple lowercase mapping
([0-9A-F]*)$/ix # simple titlecase mapping
codepoint.code = $1.hex
codepoint.combining_class = Integer($4)
codepoint.decomp_type = $7
codepoint.decomp_mapping = ($8 == "") ? nil : $8.split.collect(&:hex)
codepoint.uppercase_mapping = ($16 == "") ? 0 : $16.hex
codepoint.lowercase_mapping = ($17 == "") ? 0 : $17.hex
@ucd.codepoints[codepoint.code] = codepoint
end
def parse_grapheme_break_property(line)
if line =~ /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/
type = $2.downcase.intern
@ucd.boundary[type] ||= []
if $1.include? ".."
parts = $1.split ".."
@ucd.boundary[type] << (parts[0].hex..parts[1].hex)
else
@ucd.boundary[type] << $1.hex
end
end
end
def parse_composition_exclusion(line)
if line =~ /^([0-9A-F]+)/i
@ucd.composition_exclusion << $1.hex
end
end
def parse_cp1252(line)
if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
@ucd.cp1252[$1.hex] = $2.hex
end
end
def create_composition_map
@ucd.codepoints.each do |_, cp|
if !cp.nil? && cp.combining_class == 0 && cp.decomp_type.nil? && !cp.decomp_mapping.nil? && cp.decomp_mapping.length == 2 && @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 && !@ucd.composition_exclusion.include?(cp.code)
@ucd.composition_map[cp.decomp_mapping[0]] ||= {}
@ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
end
end
end
def normalize_boundary_map
@ucd.boundary.each do |k, v|
if [:lf, :cr].include? k
@ucd.boundary[k] = v[0]
end
end
end
def parse
SOURCES.each do |type, url|
filename = File.join(Dir.tmpdir, UNICODE_VERSION, "#{url.split('/').last}")
unless File.exist?(filename)
$stderr.puts "Downloading #{url.split('/').last}"
FileUtils.mkdir_p(File.dirname(filename))
File.open(filename, "wb") do |target|
open(url) do |source|
source.each_line { |line| target.write line }
end
end
end
File.open(filename) do |file|
file.each_line { |line| send "parse_#{type}".intern, line }
end
end
create_composition_map
normalize_boundary_map
end
def dump_to(filename)
File.open(filename, "wb") do |f|
f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
end
end
end
end
end
end
if __FILE__ == $0
filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
generator.parse
print "Writing to: #{filename}"
generator.dump_to filename
puts " (#{File.size(filename)} bytes)"
end
|