From f3abc8ac36055afed9fcc902c33ee146e066d17a Mon Sep 17 00:00:00 2001 From: Norman Clarke Date: Mon, 10 May 2010 10:46:37 -0300 Subject: Use multibyte proxy class on 1.9, refactor Unicode. Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding. Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency. [#4594 state:resolved] Signed-off-by: Jeremy Kemper --- activesupport/bin/generate_tables | 205 +++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 101 deletions(-) (limited to 'activesupport/bin') diff --git a/activesupport/bin/generate_tables b/activesupport/bin/generate_tables index f8e032139f..51edb59c77 100755 --- a/activesupport/bin/generate_tables +++ b/activesupport/bin/generate_tables @@ -11,126 +11,129 @@ require 'tmpdir' module ActiveSupport module Multibyte - class UnicodeDatabase - def load; end - end - - class UnicodeDatabaseGenerator - BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/" - SOURCES = { - :codepoints => BASE_URI + 'UnicodeData.txt', - :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', - :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', - :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' - } - - def initialize - @ucd = UnicodeDatabase.new - - default = Codepoint.new - default.combining_class = 0 - default.uppercase_mapping = 0 - default.lowercase_mapping = 0 - @ucd.codepoints = Hash.new(default) - end + module Unicode - def parse_codepoints(line) - codepoint = Codepoint.new - raise "Could not parse input." unless line =~ /^ - ([0-9A-F]+); # code - ([^;]+); # name - ([A-Z]+); # general category - ([0-9]+); # canonical combining class - ([A-Z]+); # bidi class - (<([A-Z]*)>)? # decomposition type - ((\ ?[0-9A-F]+)*); # decompomposition mapping - ([0-9]*); # decimal digit - ([0-9]*); # digit - ([^;]*); # numeric - ([YN]*); # bidi mirrored - ([^;]*); # unicode 1.0 name - ([^;]*); # iso comment - ([0-9A-F]*); # simple uppercase mapping - ([0-9A-F]*); # simple lowercase mapping - ([0-9A-F]*)$/ix # simple titlecase mapping - codepoint.code = $1.hex - #codepoint.name = $2 - #codepoint.category = $3 - codepoint.combining_class = Integer($4) - #codepoint.bidi_class = $5 - codepoint.decomp_type = $7 - codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex } - #codepoint.bidi_mirrored = ($13=='Y') ? true : false - codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex - codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex - #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex - @ucd.codepoints[codepoint.code] = codepoint + class UnicodeDatabase + def load; end end - def parse_grapheme_break_property(line) - if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/ - type = $2.downcase.intern - @ucd.boundary[type] ||= [] - if $1.include? '..' - parts = $1.split '..' - @ucd.boundary[type] << (parts[0].hex..parts[1].hex) - else - @ucd.boundary[type] << $1.hex + class DatabaseGenerator + BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/" + SOURCES = { + :codepoints => BASE_URI + 'UnicodeData.txt', + :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', + :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', + :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' + } + + def initialize + @ucd = Unicode::UnicodeDatabase.new + + default = Codepoint.new + default.combining_class = 0 + default.uppercase_mapping = 0 + default.lowercase_mapping = 0 + @ucd.codepoints = Hash.new(default) + end + + def parse_codepoints(line) + codepoint = Codepoint.new + raise "Could not parse input." unless line =~ /^ + ([0-9A-F]+); # code + ([^;]+); # name + ([A-Z]+); # general category + ([0-9]+); # canonical combining class + ([A-Z]+); # bidi class + (<([A-Z]*)>)? # decomposition type + ((\ ?[0-9A-F]+)*); # decompomposition mapping + ([0-9]*); # decimal digit + ([0-9]*); # digit + ([^;]*); # numeric + ([YN]*); # bidi mirrored + ([^;]*); # unicode 1.0 name + ([^;]*); # iso comment + ([0-9A-F]*); # simple uppercase mapping + ([0-9A-F]*); # simple lowercase mapping + ([0-9A-F]*)$/ix # simple titlecase mapping + codepoint.code = $1.hex + #codepoint.name = $2 + #codepoint.category = $3 + codepoint.combining_class = Integer($4) + #codepoint.bidi_class = $5 + codepoint.decomp_type = $7 + codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex } + #codepoint.bidi_mirrored = ($13=='Y') ? true : false + codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex + codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex + #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex + @ucd.codepoints[codepoint.code] = codepoint + end + + def parse_grapheme_break_property(line) + if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/ + type = $2.downcase.intern + @ucd.boundary[type] ||= [] + if $1.include? '..' + parts = $1.split '..' + @ucd.boundary[type] << (parts[0].hex..parts[1].hex) + else + @ucd.boundary[type] << $1.hex + end end end - end - def parse_composition_exclusion(line) - if line =~ /^([0-9A-F]+)/i - @ucd.composition_exclusion << $1.hex + def parse_composition_exclusion(line) + if line =~ /^([0-9A-F]+)/i + @ucd.composition_exclusion << $1.hex + end end - end - def parse_cp1252(line) - if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i - @ucd.cp1252[$1.hex] = $2.hex + def parse_cp1252(line) + if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i + @ucd.cp1252[$1.hex] = $2.hex + end end - end - def create_composition_map - @ucd.codepoints.each do |_, cp| - if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code) - @ucd.composition_map[cp.decomp_mapping[0]] ||= {} - @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code + def create_composition_map + @ucd.codepoints.each do |_, cp| + if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code) + @ucd.composition_map[cp.decomp_mapping[0]] ||= {} + @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code + end end end - end - def normalize_boundary_map - @ucd.boundary.each do |k,v| - if [:lf, :cr].include? k - @ucd.boundary[k] = v[0] + def normalize_boundary_map + @ucd.boundary.each do |k,v| + if [:lf, :cr].include? k + @ucd.boundary[k] = v[0] + end end end - end - def parse - SOURCES.each do |type, url| - filename = File.join(Dir.tmpdir, "#{url.split('/').last}") - unless File.exist?(filename) - $stderr.puts "Downloading #{url.split('/').last}" - File.open(filename, 'wb') do |target| - open(url) do |source| - source.each_line { |line| target.write line } + def parse + SOURCES.each do |type, url| + filename = File.join(Dir.tmpdir, "#{url.split('/').last}") + unless File.exist?(filename) + $stderr.puts "Downloading #{url.split('/').last}" + File.open(filename, 'wb') do |target| + open(url) do |source| + source.each_line { |line| target.write line } + end end end + File.open(filename) do |file| + file.each_line { |line| send "parse_#{type}".intern, line } + end end - File.open(filename) do |file| - file.each_line { |line| send "parse_#{type}".intern, line } - end + create_composition_map + normalize_boundary_map end - create_composition_map - normalize_boundary_map - end - def dump_to(filename) - File.open(filename, 'wb') do |f| - f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) + def dump_to(filename) + File.open(filename, 'wb') do |f| + f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) + end end end end @@ -138,8 +141,8 @@ module ActiveSupport end if __FILE__ == $0 - filename = ActiveSupport::Multibyte::UnicodeDatabase.filename - generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new + filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename + generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new generator.parse print "Writing to: #{filename}" generator.dump_to filename -- cgit v1.2.3