From e89919082624effcd70208eb58c4e2d90b57a2a9 Mon Sep 17 00:00:00 2001 From: Michael Koziarski Date: Wed, 4 Oct 2006 09:03:57 +0000 Subject: Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra] git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@5224 5ecf4fe2-1ee6-0310-87b1-e25e094e27de --- activesupport/CHANGELOG | 4 ++- .../multibyte/generators/generate_tables.rb | 12 +++++++-- .../multibyte/handlers/utf8_handler.rb | 28 +++++++-------------- .../active_support/values/unicode_tables-1.8.4.dat | Bin 654866 -> 0 bytes .../active_support/values/unicode_tables-1.8.5.dat | Bin 654866 -> 0 bytes .../lib/active_support/values/unicode_tables.dat | Bin 0 -> 656156 bytes activesupport/test/multibyte_chars_test.rb | 7 +++--- activesupport/test/multibyte_handler_test.rb | 12 +++++++-- 8 files changed, 36 insertions(+), 27 deletions(-) delete mode 100644 activesupport/lib/active_support/values/unicode_tables-1.8.4.dat delete mode 100644 activesupport/lib/active_support/values/unicode_tables-1.8.5.dat create mode 100644 activesupport/lib/active_support/values/unicode_tables.dat (limited to 'activesupport') diff --git a/activesupport/CHANGELOG b/activesupport/CHANGELOG index cf68da71c4..0154a6d601 100644 --- a/activesupport/CHANGELOG +++ b/activesupport/CHANGELOG @@ -1,6 +1,8 @@ *SVN* -* Add ActiveSupport::Multibyte. Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra & Jan Behrens] +* Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra] + +* Add ActiveSupport::Multibyte. Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. 
Closes #6242 [Julian Tarkhanov, Manfred Stienstra, Thijs van der Vossen & Jan Behrens] * Fix issue with #class_inheritable_accessor saving updates to the parent class when initialized with an Array or Hash [mojombo] diff --git a/activesupport/lib/active_support/multibyte/generators/generate_tables.rb b/activesupport/lib/active_support/multibyte/generators/generate_tables.rb index 4045b94282..7f807585c5 100644 --- a/activesupport/lib/active_support/multibyte/generators/generate_tables.rb +++ b/activesupport/lib/active_support/multibyte/generators/generate_tables.rb @@ -18,7 +18,8 @@ module ActiveSupport::Multibyte::Handlers #:nodoc: SOURCES = { :codepoints => BASE_URI + 'UnicodeData.txt', :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', - :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt' + :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', + :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' } def initialize @@ -33,6 +34,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc: @ucd.composition_exclusion = [] @ucd.composition_map = {} @ucd.boundary = {} + @ucd.cp1252 = {} end def parse_codepoints(line) @@ -87,6 +89,12 @@ module ActiveSupport::Multibyte::Handlers #:nodoc: end end + def parse_cp1252(line) + if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i + @ucd.cp1252[$1.hex] = $2.hex + end + end + def create_composition_map @ucd.codepoints.each do |_, cp| if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? 
and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code) @@ -125,7 +133,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc: def dump_to(filename) File.open(filename, 'wb') do |f| - f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary]) + f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) end end end diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb index 0928209d85..c6ef6ad095 100644 --- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb +++ b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb @@ -7,12 +7,12 @@ module ActiveSupport::Multibyte::Handlers end class UnicodeDatabase #:nodoc: - attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary + attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 # Creates a new UnicodeDatabase instance and loads the database. 
def initialize begin - @codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load + @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = self.class.load rescue Exception => e raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable") end @@ -20,6 +20,7 @@ module ActiveSupport::Multibyte::Handlers @composition_exclusion ||= [] @composition_map ||= {} @boundary ||= {} + @cp1252 ||= {} # Redefine the === method so we can write shorter rules for grapheme cluster breaks @boundary.each do |k,_| @@ -41,26 +42,11 @@ module ActiveSupport::Multibyte::Handlers # Returns the filename for the data file for this version def self.filename - File.expand_path File.join(dirname, "unicode_tables-#{VERSION}.dat") + File.expand_path File.join(dirname, "unicode_tables.dat") end # Loads the unicode database and returns all the internal objects of UnicodeDatabase def self.load - begin - return load_file(filename) - rescue Exception - # If we can't load our own version, try the rest - Dir["#{dirname}/*.dat"].sort.each do |dat| - begin - return load_file(dat) - rescue Exception - end - end - end - raise IOError.new("Can't load a marshal file for your version of Ruby") - end - - def self.load_file(filename) File.open(self.filename, 'rb') { |f| Marshal.load f.read } end end @@ -275,7 +261,11 @@ module ActiveSupport::Multibyte::Handlers # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string def tidy_bytes(str) - str.split(//u).reject { |c| !UTF8_PAT.match(c) }.join + str.unpack('C*').map { |n| + n < 128 ? n.chr : + n < 160 ? [UCD.cp1252[n] || n].pack('U') : + n < 192 ? 
"\xC2" + n.chr : "\xC3" + (n-64).chr + }.join end protected diff --git a/activesupport/lib/active_support/values/unicode_tables-1.8.4.dat b/activesupport/lib/active_support/values/unicode_tables-1.8.4.dat deleted file mode 100644 index 13a081388a..0000000000 Binary files a/activesupport/lib/active_support/values/unicode_tables-1.8.4.dat and /dev/null differ diff --git a/activesupport/lib/active_support/values/unicode_tables-1.8.5.dat b/activesupport/lib/active_support/values/unicode_tables-1.8.5.dat deleted file mode 100644 index 7b96885f32..0000000000 Binary files a/activesupport/lib/active_support/values/unicode_tables-1.8.5.dat and /dev/null differ diff --git a/activesupport/lib/active_support/values/unicode_tables.dat b/activesupport/lib/active_support/values/unicode_tables.dat new file mode 100644 index 0000000000..35edb148c3 Binary files /dev/null and b/activesupport/lib/active_support/values/unicode_tables.dat differ diff --git a/activesupport/test/multibyte_chars_test.rb b/activesupport/test/multibyte_chars_test.rb index e5ad9d26ee..63aff6eb99 100644 --- a/activesupport/test/multibyte_chars_test.rb +++ b/activesupport/test/multibyte_chars_test.rb @@ -139,14 +139,15 @@ class CharsTest < Test::Unit::TestCase def test_resilience assert_nothing_raised do - assert_equal 1, @s[:bytes].chars.size, "There's only one valid utf-8 byte in the string" + assert_equal 5, @s[:bytes].chars.size, "The sequence contains five interpretable bytes" end + reversed = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].reverse.pack('U*') assert_nothing_raised do - assert_equal "\010", @s[:bytes].chars.reverse, "There's only one valid utf-8 byte in the string" + assert_equal reversed, @s[:bytes].chars.reverse.to_s, "Reversing the string should only yield interpretable bytes" end assert_nothing_raised do @s[:bytes].chars.reverse! 
- assert_equal "\010", @s[:bytes], "There's only one valid utf-8 byte in the string" + assert_equal reversed, @s[:bytes].to_s, "Reversing the string should only yield interpretable bytes" end end diff --git a/activesupport/test/multibyte_handler_test.rb b/activesupport/test/multibyte_handler_test.rb index 08291c1212..7de3c0001e 100644 --- a/activesupport/test/multibyte_handler_test.rb +++ b/activesupport/test/multibyte_handler_test.rb @@ -224,9 +224,17 @@ module UTF8HandlingTest end def test_tidy_bytes - assert_equal "\010", @handler.tidy_bytes(@bytestring) - assert_equal "a\010a", @handler.tidy_bytes('a' + @bytestring + 'a') + result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*') + assert_equal result, @handler.tidy_bytes(@bytestring) + assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a') assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') } + + assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla + assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol + assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote + assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro + assert_equal "\x00", @handler.tidy_bytes("\x00") # null char + assert_equal [0xef, 0xbf, 0xbd].pack('U*'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char end protected -- cgit v1.2.3