activesupport/bin/generate_tables


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

#!/usr/bin/env ruby

begin
  $:.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
  require "active_support"
rescue IOError
end

require "open-uri"
require "tmpdir"
require "fileutils"

module ActiveSupport
  module Multibyte
    module Unicode
      class UnicodeDatabase
        def load; end
      end

      class DatabaseGenerator
        BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
        SOURCES = {
          codepoints: BASE_URI + "UnicodeData.txt",
          composition_exclusion: BASE_URI + "CompositionExclusions.txt",
          grapheme_break_property: BASE_URI + "auxiliary/GraphemeBreakProperty.txt",
          cp1252: "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT"
        }

        def initialize
          @ucd = Unicode::UnicodeDatabase.new
        end

        def parse_codepoints(line)
          codepoint = Codepoint.new
          raise "Could not parse input." unless line =~ /^
            ([0-9A-F]+);        # code
            ([^;]+);            # name
            ([A-Z]+);           # general category
            ([0-9]+);           # canonical combining class
            ([A-Z]+);           # bidi class
            (<([A-Z]*)>)?       # decomposition type
            ((\ ?[0-9A-F]+)*);  # decomposition mapping
            ([0-9]*);           # decimal digit
            ([0-9]*);           # digit
            ([^;]*);            # numeric
            ([YN]*);            # bidi mirrored
            ([^;]*);            # unicode 1.0 name
            ([^;]*);            # iso comment
            ([0-9A-F]*);        # simple uppercase mapping
            ([0-9A-F]*);        # simple lowercase mapping
            ([0-9A-F]*)$/ix     # simple titlecase mapping
          codepoint.code              = $1.hex
          codepoint.combining_class   = Integer($4)
          codepoint.decomp_type       = $7
          codepoint.decomp_mapping    = ($8 == "") ? nil : $8.split.collect(&:hex)
          codepoint.uppercase_mapping = ($16 == "") ? 0 : $16.hex
          codepoint.lowercase_mapping = ($17 == "") ? 0 : $17.hex
          @ucd.codepoints[codepoint.code] = codepoint
        end

        def parse_grapheme_break_property(line)
          if line =~ /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/
            type = $2.downcase.intern
            @ucd.boundary[type] ||= []
            if $1.include? ".."
              parts = $1.split ".."
              @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
            else
              @ucd.boundary[type] << $1.hex
            end
          end
        end

        def parse_composition_exclusion(line)
          if line =~ /^([0-9A-F]+)/i
            @ucd.composition_exclusion << $1.hex
          end
        end

        def parse_cp1252(line)
          if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
            @ucd.cp1252[$1.hex] = $2.hex
          end
        end

        def create_composition_map
          @ucd.codepoints.each do |_, cp|
            if !cp.nil? && cp.combining_class == 0 && cp.decomp_type.nil? && !cp.decomp_mapping.nil? && cp.decomp_mapping.length == 2 && @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 && !@ucd.composition_exclusion.include?(cp.code)
              @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
              @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
            end
          end
        end

        def normalize_boundary_map
          @ucd.boundary.each do |k, v|
            if [:lf, :cr].include? k
              @ucd.boundary[k] = v[0]
            end
          end
        end

        def parse
          SOURCES.each do |type, url|
            filename = File.join(Dir.tmpdir, UNICODE_VERSION, "#{url.split('/').last}")
            unless File.exist?(filename)
              $stderr.puts "Downloading #{url.split('/').last}"
              FileUtils.mkdir_p(File.dirname(filename))
              File.open(filename, "wb") do |target|
                open(url) do |source|
                  source.each_line { |line| target.write line }
                end
              end
            end
            File.open(filename) do |file|
              file.each_line { |line| send "parse_#{type}".intern, line }
            end
          end
          create_composition_map
          normalize_boundary_map
        end

        def dump_to(filename)
          File.open(filename, "wb") do |f|
            f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
          end
        end
      end
    end
  end
end

if __FILE__ == $0
  filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
  generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
  generator.parse
  print "Writing to: #{filename}"
  generator.dump_to filename
  puts " (#{File.size(filename)} bytes)"
end