aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/bin/generate_tables
blob: f8e032139f4ae23ecb085c46c51fe07cbf9d55d0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env ruby

# Put the lib directory that sits next to this script's directory (../lib)
# at the front of the load path, then load active_support from it.
# NOTE(review): only IOError is rescued here. A failed `require` normally
# raises LoadError, which is not an IOError and would still propagate —
# confirm whether swallowing a missing library was the intent.
begin
  $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
  require 'active_support'
rescue IOError
end

require 'open-uri'
require 'tmpdir'

module ActiveSupport
  module Multibyte
    class UnicodeDatabase
      # Replaces UnicodeDatabase#load with a method that does nothing.
      # NOTE(review): presumably this keeps the database from trying to read
      # the table file while this script is still generating it — confirm.
      def load
      end
    end
    
    class UnicodeDatabaseGenerator
      # Directory URL of the Unicode Character Database files for the version
      # named by ActiveSupport::Multibyte::UNICODE_VERSION.
      BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
      # Data files to load, keyed by table type. #parse feeds every line of
      # each file to the instance method named parse_<key>.
      SOURCES = {
        :codepoints => BASE_URI + 'UnicodeData.txt',
        :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
        :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
        :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
      }

      # Sets up an empty UnicodeDatabase in @ucd.
      #
      # The codepoint table is a Hash whose default value is a single shared
      # placeholder Codepoint with combining class 0 and both case mappings 0,
      # so looking up an unknown code yields that placeholder without adding
      # a key.
      def initialize
        placeholder = Codepoint.new
        placeholder.lowercase_mapping = 0
        placeholder.uppercase_mapping = 0
        placeholder.combining_class = 0

        @ucd = UnicodeDatabase.new
        @ucd.codepoints = Hash.new(placeholder)
      end

      # Parses one record of UnicodeData.txt and stores it in the codepoint
      # table under its numeric code.
      #
      # Only the code, the canonical combining class, the decomposition type
      # and mapping, and the simple upper/lowercase mappings are kept; the
      # remaining fields are matched but not stored.
      #
      # Raises RuntimeError when +line+ does not have the expected layout.
      # Returns the stored Codepoint.
      def parse_codepoints(line)
        fields = line.match(/^
          ([0-9A-F]+);        # code
          ([^;]+);            # name
          ([A-Z]+);           # general category
          ([0-9]+);           # canonical combining class
          ([A-Z]+);           # bidi class
          (<([A-Z]*)>)?       # decomposition type
          ((\ ?[0-9A-F]+)*);  # decompomposition mapping
          ([0-9]*);           # decimal digit
          ([0-9]*);           # digit
          ([^;]*);            # numeric
          ([YN]*);            # bidi mirrored
          ([^;]*);            # unicode 1.0 name
          ([^;]*);            # iso comment
          ([0-9A-F]*);        # simple uppercase mapping
          ([0-9A-F]*);        # simple lowercase mapping
          ([0-9A-F]*)$/ix)    # simple titlecase mapping
        raise "Could not parse input." unless fields

        decomposition = fields[8]

        codepoint = Codepoint.new
        codepoint.code            = fields[1].hex
        codepoint.combining_class = Integer(fields[4])
        codepoint.decomp_type     = fields[7]
        codepoint.decomp_mapping  =
          if decomposition.empty?
            nil
          else
            decomposition.split.map { |digits| digits.hex }
          end
        # String#hex gives 0 for an empty field, which is the value used for
        # "no case mapping".
        codepoint.uppercase_mapping = fields[16].hex
        codepoint.lowercase_mapping = fields[17].hex

        @ucd.codepoints[codepoint.code] = codepoint
      end

      # Parses one line of GraphemeBreakProperty.txt.
      #
      # A data line holds a code point or a "first..last" span, a semicolon,
      # a property name and a trailing "#" comment. The property name is
      # downcased into a symbol, and the code point (Integer) or span (Range)
      # is appended to that property's list in the boundary table. Lines that
      # do not match are ignored and yield nil.
      def parse_grapheme_break_property(line)
        match = /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/.match(line)
        return unless match

        span = match[1]
        members = (@ucd.boundary[match[2].downcase.intern] ||= [])
        entry =
          if span.include?('..')
            first, last = span.split('..')
            first.hex..last.hex
          else
            span.hex
          end
        members << entry
      end

      # Parses one line of CompositionExclusions.txt: when the line starts
      # with hexadecimal digits, that code point is appended to the
      # composition exclusion list. Any other line is ignored and yields nil.
      def parse_composition_exclusion(line)
        match = /^([0-9A-F]+)/i.match(line)
        return unless match

        @ucd.composition_exclusion << match[1].hex
      end

      # Parses one line of the CP1252 mapping file: two hexadecimal values
      # (an optional "0x" prefix is accepted) separated by one whitespace
      # character. The first is stored as the key and the second as the value
      # in the cp1252 table. Lines without such a pair are ignored and yield
      # nil.
      def parse_cp1252(line)
        pair = line.match(/^([0-9A-Fx]+)\s([0-9A-Fx]+)/i)
        @ucd.cp1252[pair[1].hex] = pair[2].hex if pair
      end

      # Builds the composition map from the parsed codepoints.
      #
      # A codepoint contributes an entry
      # <tt>composition_map[first][second] = code</tt> when all of these hold:
      # its combining class is 0, it has no decomposition type, its
      # decomposition mapping has exactly two elements, the first of those has
      # combining class 0, and its code is not in the composition exclusion
      # list.
      def create_composition_map
        @ucd.codepoints.each do |_, cp|
          next if cp.nil?
          next unless cp.combining_class == 0 && cp.decomp_type.nil?

          pair = cp.decomp_mapping
          next if pair.nil? || pair.length != 2
          # The table's default entry has combining class 0, so a first
          # element that was never parsed passes this check.
          next unless @ucd.codepoints[pair[0]].combining_class == 0
          next if @ucd.composition_exclusion.include?(cp.code)

          (@ucd.composition_map[pair[0]] ||= {})[pair[1]] = cp.code
        end
      end

      # Replaces the :lf and :cr entries of the boundary table, when present,
      # with the first element of their lists; every other entry is left
      # as it is. Returns the boundary table.
      # NOTE(review): presumably each of these two properties covers a single
      # code point, so the one-element list is unwrapped — confirm.
      def normalize_boundary_map
        table = @ucd.boundary
        [:lf, :cr].each do |property|
          table[property] = table[property][0] if table.key?(property)
        end
        table
      end

      # Loads every file listed in SOURCES and builds the derived tables.
      #
      # For each type/url pair the file is downloaded into Dir.tmpdir unless
      # a file with the same basename is already there, and each of its lines
      # is passed to the method named <tt>parse_<type></tt>. Afterwards the
      # composition map is built and the boundary map is normalized.
      #
      # Raises whatever the download or the per-line parsers raise. A failed
      # download leaves no file behind, so the next run downloads again
      # instead of parsing a truncated copy.
      def parse
        SOURCES.each do |type, url|
          basename = url.split('/').last
          filename = File.join(Dir.tmpdir, basename)
          unless File.exist?(filename)
            $stderr.puts "Downloading #{basename}"
            download_source(url, filename)
          end
          handler = "parse_#{type}".intern
          File.open(filename) do |file|
            file.each_line { |line| send handler, line }
          end
        end
        create_composition_map
        normalize_boundary_map
      end

      # Downloads +url+ to +filename+. The data is written to
      # "<filename>.part" first and renamed only after the whole transfer
      # succeeded; the partial file is removed if anything goes wrong.
      def download_source(url, filename)
        partial = "#{filename}.part"
        # Kernel#open no longer opens URLs on Ruby 3.0+; URI.open (public
        # since Ruby 2.5) is the replacement. Older rubies only have the
        # open-uri patched Kernel.open.
        opener = URI.respond_to?(:open) ? URI : Kernel
        File.open(partial, 'wb') do |target|
          opener.open(url) do |source|
            source.each_line { |line| target.write line }
          end
        end
        File.rename(partial, filename)
      ensure
        File.delete(partial) if File.exist?(partial)
      end
      private :download_source

      # Writes the database to +filename+ in binary mode as one Marshal dump
      # of an array holding, in this order: codepoints, composition_exclusion,
      # composition_map, boundary and cp1252.
      def dump_to(filename)
        File.open(filename, 'wb') do |io|
          tables = [
            @ucd.codepoints,
            @ucd.composition_exclusion,
            @ucd.composition_map,
            @ucd.boundary,
            @ucd.cp1252
          ]
          io.write(Marshal.dump(tables))
        end
      end
    end
  end
end

# When run as a script: parse all sources, write the tables to the file named
# by UnicodeDatabase.filename, and report that path and the resulting size.
if $0 == __FILE__
  filename = ActiveSupport::Multibyte::UnicodeDatabase.filename

  builder = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
  builder.parse

  # print leaves the line open so the size can be appended after the write.
  print "Writing to: #{filename}"
  builder.dump_to(filename)
  puts " (#{File.size(filename)} bytes)"
end