From 22f75d539dca7b6f33cbf86e4e9d1944bb22731f Mon Sep 17 00:00:00 2001 From: Manfred Stienstra Date: Sun, 21 Sep 2008 17:21:30 +0200 Subject: Simplify ActiveSupport::Multibyte and make it run on Ruby 1.9. * Unicode methods are now defined directly on Chars instead of a handler * Updated Unicode database to Unicode 5.1.0 * Improved documentation --- activesupport/bin/generate_tables | 147 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100755 activesupport/bin/generate_tables (limited to 'activesupport/bin') diff --git a/activesupport/bin/generate_tables b/activesupport/bin/generate_tables new file mode 100755 index 0000000000..f8e032139f --- /dev/null +++ b/activesupport/bin/generate_tables @@ -0,0 +1,147 @@ +#!/usr/bin/env ruby + +begin + $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib')) + require 'active_support' +rescue IOError +end + +require 'open-uri' +require 'tmpdir' + +module ActiveSupport + module Multibyte + class UnicodeDatabase + def load; end + end + + class UnicodeDatabaseGenerator + BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/" + SOURCES = { + :codepoints => BASE_URI + 'UnicodeData.txt', + :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', + :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', + :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' + } + + def initialize + @ucd = UnicodeDatabase.new + + default = Codepoint.new + default.combining_class = 0 + default.uppercase_mapping = 0 + default.lowercase_mapping = 0 + @ucd.codepoints = Hash.new(default) + end + + def parse_codepoints(line) + codepoint = Codepoint.new + raise "Could not parse input." unless line =~ /^ + ([0-9A-F]+); # code + ([^;]+); # name + ([A-Z]+); # general category + ([0-9]+); # canonical combining class + ([A-Z]+); # bidi class + (<([A-Z]*)>)? # decomposition type + ((\ ?[0-9A-F]+)*); # decompomposition mapping + ([0-9]*); # decimal digit + ([0-9]*); # digit + ([^;]*); # numeric + ([YN]*); # bidi mirrored + ([^;]*); # unicode 1.0 name + ([^;]*); # iso comment + ([0-9A-F]*); # simple uppercase mapping + ([0-9A-F]*); # simple lowercase mapping + ([0-9A-F]*)$/ix # simple titlecase mapping + codepoint.code = $1.hex + #codepoint.name = $2 + #codepoint.category = $3 + codepoint.combining_class = Integer($4) + #codepoint.bidi_class = $5 + codepoint.decomp_type = $7 + codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex } + #codepoint.bidi_mirrored = ($13=='Y') ? true : false + codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex + codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex + #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex + @ucd.codepoints[codepoint.code] = codepoint + end + + def parse_grapheme_break_property(line) + if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/ + type = $2.downcase.intern + @ucd.boundary[type] ||= [] + if $1.include? '..' + parts = $1.split '..' + @ucd.boundary[type] << (parts[0].hex..parts[1].hex) + else + @ucd.boundary[type] << $1.hex + end + end + end + + def parse_composition_exclusion(line) + if line =~ /^([0-9A-F]+)/i + @ucd.composition_exclusion << $1.hex + end + end + + def parse_cp1252(line) + if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i + @ucd.cp1252[$1.hex] = $2.hex + end + end + + def create_composition_map + @ucd.codepoints.each do |_, cp| + if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code) + @ucd.composition_map[cp.decomp_mapping[0]] ||= {} + @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code + end + end + end + + def normalize_boundary_map + @ucd.boundary.each do |k,v| + if [:lf, :cr].include? k + @ucd.boundary[k] = v[0] + end + end + end + + def parse + SOURCES.each do |type, url| + filename = File.join(Dir.tmpdir, "#{url.split('/').last}") + unless File.exist?(filename) + $stderr.puts "Downloading #{url.split('/').last}" + File.open(filename, 'wb') do |target| + open(url) do |source| + source.each_line { |line| target.write line } + end + end + end + File.open(filename) do |file| + file.each_line { |line| send "parse_#{type}".intern, line } + end + end + create_composition_map + normalize_boundary_map + end + + def dump_to(filename) + File.open(filename, 'wb') do |f| + f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) + end + end + end + end +end + +if __FILE__ == $0 + filename = ActiveSupport::Multibyte::UnicodeDatabase.filename + generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new + generator.parse + print "Writing to: #{filename}" + generator.dump_to filename + puts " (#{File.size(filename)} bytes)" +end -- cgit v1.2.3