#!/usr/bin/env ruby begin $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib')) require 'active_support' rescue IOError end require 'open-uri' require 'tmpdir' module ActiveSupport module Multibyte module Unicode class UnicodeDatabase def load; end end class DatabaseGenerator BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/" SOURCES = { :codepoints => BASE_URI + 'UnicodeData.txt', :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' } def initialize @ucd = Unicode::UnicodeDatabase.new default = Codepoint.new default.combining_class = 0 default.uppercase_mapping = 0 default.lowercase_mapping = 0 @ucd.codepoints = Hash.new(default) end def parse_codepoints(line) codepoint = Codepoint.new raise "Could not parse input." unless line =~ /^ ([0-9A-F]+); # code ([^;]+); # name ([A-Z]+); # general category ([0-9]+); # canonical combining class ([A-Z]+); # bidi class (<([A-Z]*)>)? # decomposition type ((\ ?[0-9A-F]+)*); # decomposition mapping ([0-9]*); # decimal digit ([0-9]*); # digit ([^;]*); # numeric ([YN]*); # bidi mirrored ([^;]*); # unicode 1.0 name ([^;]*); # iso comment ([0-9A-F]*); # simple uppercase mapping ([0-9A-F]*); # simple lowercase mapping ([0-9A-F]*)$/ix # simple titlecase mapping codepoint.code = $1.hex #codepoint.name = $2 #codepoint.category = $3 codepoint.combining_class = Integer($4) #codepoint.bidi_class = $5 codepoint.decomp_type = $7 codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex } #codepoint.bidi_mirrored = ($13=='Y') ? true : false codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex @ucd.codepoints[codepoint.code] = codepoint end def parse_grapheme_break_property(line) if line =~ /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/ type = $2.downcase.intern @ucd.boundary[type] ||= [] if $1.include? '..' parts = $1.split '..' @ucd.boundary[type] << (parts[0].hex..parts[1].hex) else @ucd.boundary[type] << $1.hex end end end def parse_composition_exclusion(line) if line =~ /^([0-9A-F]+)/i @ucd.composition_exclusion << $1.hex end end def parse_cp1252(line) if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i @ucd.cp1252[$1.hex] = $2.hex end end def create_composition_map @ucd.codepoints.each do |_, cp| if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code) @ucd.composition_map[cp.decomp_mapping[0]] ||= {} @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code end end end def normalize_boundary_map @ucd.boundary.each do |k,v| if [:lf, :cr].include? k @ucd.boundary[k] = v[0] end end end def parse SOURCES.each do |type, url| filename = File.join(Dir.tmpdir, "#{url.split('/').last}") unless File.exist?(filename) $stderr.puts "Downloading #{url.split('/').last}" File.open(filename, 'wb') do |target| open(url) do |source| source.each_line { |line| target.write line } end end end File.open(filename) do |file| file.each_line { |line| send "parse_#{type}".intern, line } end end create_composition_map normalize_boundary_map end def dump_to(filename) File.open(filename, 'wb') do |f| f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) end end end end end end if __FILE__ == $0 filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new generator.parse print "Writing to: #{filename}" generator.dump_to filename puts " (#{File.size(filename)} bytes)" end