about | summary | refs | log | tree | commit | diff | stats
path: root/activesupport/lib/active_support/multibyte
diff options
context:
space:
mode:
author Michael Koziarski <michael@koziarski.com> 2006-10-04 09:03:57 +0000
committer Michael Koziarski <michael@koziarski.com> 2006-10-04 09:03:57 +0000
commit e89919082624effcd70208eb58c4e2d90b57a2a9 (patch)
tree 7117b6a06dd10a05c9c37c323ce6ce161590c267 /activesupport/lib/active_support/multibyte
parent f238d495b70a264abdb864fe8107e02766b285b4 (diff)
download rails-e89919082624effcd70208eb58c4e2d90b57a2a9.tar.gz
download rails-e89919082624effcd70208eb58c4e2d90b57a2a9.tar.bz2
download rails-e89919082624effcd70208eb58c4e2d90b57a2a9.zip
Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra]
git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@5224 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
Diffstat (limited to 'activesupport/lib/active_support/multibyte')
-rw-r--r-- activesupport/lib/active_support/multibyte/generators/generate_tables.rb 12
-rw-r--r-- activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb 28
2 files changed, 19 insertions, 21 deletions
diff --git a/activesupport/lib/active_support/multibyte/generators/generate_tables.rb b/activesupport/lib/active_support/multibyte/generators/generate_tables.rb
index 4045b94282..7f807585c5 100644
--- a/activesupport/lib/active_support/multibyte/generators/generate_tables.rb
+++ b/activesupport/lib/active_support/multibyte/generators/generate_tables.rb
@@ -18,7 +18,8 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
SOURCES = {
:codepoints => BASE_URI + 'UnicodeData.txt',
:composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
- :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt'
+ :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+ :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
}
def initialize
@@ -33,6 +34,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
@ucd.composition_exclusion = []
@ucd.composition_map = {}
@ucd.boundary = {}
+ @ucd.cp1252 = {}
end
def parse_codepoints(line)
@@ -87,6 +89,12 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
end
end
+ def parse_cp1252(line)
+ if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+ @ucd.cp1252[$1.hex] = $2.hex
+ end
+ end
+
def create_composition_map
@ucd.codepoints.each do |_, cp|
if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
@@ -125,7 +133,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
def dump_to(filename)
File.open(filename, 'wb') do |f|
- f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary])
+ f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
end
end
end
diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
index 0928209d85..c6ef6ad095 100644
--- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
+++ b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
@@ -7,12 +7,12 @@ module ActiveSupport::Multibyte::Handlers
end
class UnicodeDatabase #:nodoc:
- attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary
+ attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
# Creates a new UnicodeDatabase instance and loads the database.
def initialize
begin
- @codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load
+ @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = self.class.load
rescue Exception => e
raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable")
end
@@ -20,6 +20,7 @@ module ActiveSupport::Multibyte::Handlers
@composition_exclusion ||= []
@composition_map ||= {}
@boundary ||= {}
+ @cp1252 ||= {}
# Redefine the === method so we can write shorter rules for grapheme cluster breaks
@boundary.each do |k,_|
@@ -41,26 +42,11 @@ module ActiveSupport::Multibyte::Handlers
# Returns the filename for the data file for this version
def self.filename
- File.expand_path File.join(dirname, "unicode_tables-#{VERSION}.dat")
+ File.expand_path File.join(dirname, "unicode_tables.dat")
end
# Loads the unicode database and returns all the internal objects of UnicodeDatabase
def self.load
- begin
- return load_file(filename)
- rescue Exception
- # If we can't load our own version, try the rest
- Dir["#{dirname}/*.dat"].sort.each do |dat|
- begin
- return load_file(dat)
- rescue Exception
- end
- end
- end
- raise IOError.new("Can't load a marshal file for your version of Ruby")
- end
-
- def self.load_file(filename)
File.open(self.filename, 'rb') { |f| Marshal.load f.read }
end
end
@@ -275,7 +261,11 @@ module ActiveSupport::Multibyte::Handlers
# Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string
def tidy_bytes(str)
- str.split(//u).reject { |c| !UTF8_PAT.match(c) }.join
+ str.unpack('C*').map { |n|
+ n < 128 ? n.chr :
+ n < 160 ? [UCD.cp1252[n] || n].pack('U') :
+ n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
+ }.join
end
protected