diff options
Diffstat (limited to 'activesupport/bin/generate_tables')
-rwxr-xr-x | activesupport/bin/generate_tables | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/activesupport/bin/generate_tables b/activesupport/bin/generate_tables new file mode 100755 index 0000000000..18199b2171 --- /dev/null +++ b/activesupport/bin/generate_tables @@ -0,0 +1,141 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +begin + $:.unshift(File.expand_path("../lib", __dir__)) + require "active_support" +rescue IOError +end + +require "open-uri" +require "tmpdir" +require "fileutils" + +module ActiveSupport + module Multibyte + module Unicode + class UnicodeDatabase + def load; end + end + + class DatabaseGenerator + BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/" + SOURCES = { + codepoints: BASE_URI + "UnicodeData.txt", + composition_exclusion: BASE_URI + "CompositionExclusions.txt", + grapheme_break_property: BASE_URI + "auxiliary/GraphemeBreakProperty.txt", + cp1252: "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT" + } + + def initialize + @ucd = Unicode::UnicodeDatabase.new + end + + def parse_codepoints(line) + codepoint = Codepoint.new + raise "Could not parse input." unless line =~ /^ + ([0-9A-F]+); # code + ([^;]+); # name + ([A-Z]+); # general category + ([0-9]+); # canonical combining class + ([A-Z]+); # bidi class + (<([A-Z]*)>)? # decomposition type + ((\ ?[0-9A-F]+)*); # decomposition mapping + ([0-9]*); # decimal digit + ([0-9]*); # digit + ([^;]*); # numeric + ([YN]*); # bidi mirrored + ([^;]*); # unicode 1.0 name + ([^;]*); # iso comment + ([0-9A-F]*); # simple uppercase mapping + ([0-9A-F]*); # simple lowercase mapping + ([0-9A-F]*)$/ix # simple titlecase mapping + codepoint.code = $1.hex + codepoint.combining_class = Integer($4) + codepoint.decomp_type = $7 + codepoint.decomp_mapping = ($8 == "") ? nil : $8.split.collect(&:hex) + codepoint.uppercase_mapping = ($16 == "") ? 0 : $16.hex + codepoint.lowercase_mapping = ($17 == "") ? 0 : $17.hex + @ucd.codepoints[codepoint.code] = codepoint + end + + def parse_grapheme_break_property(line) + if line =~ /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/ + type = $2.downcase.intern + @ucd.boundary[type] ||= [] + if $1.include? ".." + parts = $1.split ".." + @ucd.boundary[type] << (parts[0].hex..parts[1].hex) + else + @ucd.boundary[type] << $1.hex + end + end + end + + def parse_composition_exclusion(line) + if line =~ /^([0-9A-F]+)/i + @ucd.composition_exclusion << $1.hex + end + end + + def parse_cp1252(line) + if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i + @ucd.cp1252[$1.hex] = $2.hex + end + end + + def create_composition_map + @ucd.codepoints.each do |_, cp| + if !cp.nil? && cp.combining_class == 0 && cp.decomp_type.nil? && !cp.decomp_mapping.nil? && cp.decomp_mapping.length == 2 && @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 && !@ucd.composition_exclusion.include?(cp.code) + @ucd.composition_map[cp.decomp_mapping[0]] ||= {} + @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code + end + end + end + + def normalize_boundary_map + @ucd.boundary.each do |k, v| + if [:lf, :cr].include? k + @ucd.boundary[k] = v[0] + end + end + end + + def parse + SOURCES.each do |type, url| + filename = File.join(Dir.tmpdir, UNICODE_VERSION, "#{url.split('/').last}") + unless File.exist?(filename) + $stderr.puts "Downloading #{url.split('/').last}" + FileUtils.mkdir_p(File.dirname(filename)) + File.open(filename, "wb") do |target| + open(url) do |source| + source.each_line { |line| target.write line } + end + end + end + File.open(filename) do |file| + file.each_line { |line| send "parse_#{type}".intern, line } + end + end + create_composition_map + normalize_boundary_map + end + + def dump_to(filename) + File.open(filename, "wb") do |f| + f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) + end + end + end + end + end +end + +if __FILE__ == $0 + filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename + generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new + generator.parse + print "Writing to: #{filename}" + generator.dump_to filename + puts " (#{File.size(filename)} bytes)" +end |