diff options
author | Xavier Noria <fxn@hashref.com> | 2010-05-27 10:57:05 +0200 |
---|---|---|
committer | Xavier Noria <fxn@hashref.com> | 2010-05-27 10:57:05 +0200 |
commit | ddec74fb3ab66528ca98ff37ca9b5f4227fdd2e3 (patch) | |
tree | c58ac9162487bdca68342fb2d30730ccfd926378 /activesupport/lib | |
parent | 96e2094b8b634e4af0d9d3c8a1db9bbb7023a4a7 (diff) | |
parent | e02db06ece7aeecec7c37f5b0e3de7d65c8684e6 (diff) | |
download | rails-ddec74fb3ab66528ca98ff37ca9b5f4227fdd2e3.tar.gz rails-ddec74fb3ab66528ca98ff37ca9b5f4227fdd2e3.tar.bz2 rails-ddec74fb3ab66528ca98ff37ca9b5f4227fdd2e3.zip |
Merge remote branch 'rails/master'
Diffstat (limited to 'activesupport/lib')
-rw-r--r-- | activesupport/lib/active_support/callbacks.rb | 51 | ||||
-rw-r--r-- | activesupport/lib/active_support/core_ext/class/attribute.rb | 5 | ||||
-rw-r--r-- | activesupport/lib/active_support/core_ext/module/attr_internal.rb | 4 | ||||
-rw-r--r-- | activesupport/lib/active_support/core_ext/string/multibyte.rb | 30 | ||||
-rw-r--r-- | activesupport/lib/active_support/dependencies.rb | 2 | ||||
-rw-r--r-- | activesupport/lib/active_support/duration.rb | 1 | ||||
-rw-r--r-- | activesupport/lib/active_support/inflector/transliterate.rb | 5 | ||||
-rw-r--r-- | activesupport/lib/active_support/multibyte.rb | 20 | ||||
-rw-r--r-- | activesupport/lib/active_support/multibyte/chars.rb | 577 | ||||
-rw-r--r-- | activesupport/lib/active_support/multibyte/unicode.rb | 393 | ||||
-rw-r--r-- | activesupport/lib/active_support/multibyte/unicode_database.rb | 71 | ||||
-rw-r--r-- | activesupport/lib/active_support/railtie.rb | 5 | ||||
-rw-r--r-- | activesupport/lib/active_support/testing/declarative.rb | 2 | ||||
-rw-r--r-- | activesupport/lib/active_support/values/unicode_tables.dat | bin | 710734 -> 710743 bytes |
14 files changed, 588 insertions, 578 deletions
diff --git a/activesupport/lib/active_support/callbacks.rb b/activesupport/lib/active_support/callbacks.rb index 5a7b94ead7..933667c909 100644 --- a/activesupport/lib/active_support/callbacks.rb +++ b/activesupport/lib/active_support/callbacks.rb @@ -203,8 +203,8 @@ module ActiveSupport # end # name = "_conditional_callback_#{@kind}_#{next_id}" - txt, line = <<-RUBY_EVAL, __LINE__ + 1 - def #{name}(halted) + @klass.class_eval <<-RUBY_EVAL, __FILE__, __LINE__ + 1 + def #{name}(halted) #{@compiled_options[0] || "if true"} && !halted #{@filter} do yield self @@ -214,7 +214,6 @@ module ActiveSupport end end RUBY_EVAL - @klass.class_eval(txt, __FILE__, line) "#{name}(halted) do" end end @@ -312,9 +311,9 @@ module ActiveSupport def _normalize_legacy_filter(kind, filter) if !filter.respond_to?(kind) && filter.respond_to?(:filter) - filter.singleton_class.class_eval( - "def #{kind}(context, &block) filter(context, &block) end", - __FILE__, __LINE__ - 1) + filter.singleton_class.class_eval <<-RUBY_EVAL, __FILE__, __LINE__ + 1 + def #{kind}(context, &block) filter(context, &block) end + RUBY_EVAL elsif filter.respond_to?(:before) && filter.respond_to?(:after) && kind == :around def filter.around(context) should_continue = before(context) @@ -387,31 +386,29 @@ module ActiveSupport send("_update_#{symbol}_superclass_callbacks") body = send("_#{symbol}_callbacks").compile(nil) - body, line = <<-RUBY_EVAL, __LINE__ + 1 - def _run_#{symbol}_callbacks(key = nil, &blk) - if self.class.send("_update_#{symbol}_superclass_callbacks") - self.class.__define_runner(#{symbol.inspect}) - return _run_#{symbol}_callbacks(key, &blk) - end + silence_warnings do + undef_method "_run_#{symbol}_callbacks" if method_defined?("_run_#{symbol}_callbacks") + class_eval <<-RUBY_EVAL, __FILE__, __LINE__ + 1 + def _run_#{symbol}_callbacks(key = nil, &blk) + if self.class.send("_update_#{symbol}_superclass_callbacks") + self.class.__define_runner(#{symbol.inspect}) + return _run_#{symbol}_callbacks(key, &blk) + end - if key - name = "_run__\#{self.class.name.hash.abs}__#{symbol}__\#{key.hash.abs}__callbacks" + if key + name = "_run__\#{self.class.name.hash.abs}__#{symbol}__\#{key.hash.abs}__callbacks" - unless respond_to?(name) - self.class.__create_keyed_callback(name, :#{symbol}, self, &blk) - end + unless respond_to?(name) + self.class.__create_keyed_callback(name, :#{symbol}, self, &blk) + end - send(name, &blk) - else - #{body} + send(name, &blk) + else + #{body} + end end - end - private :_run_#{symbol}_callbacks - RUBY_EVAL - - silence_warnings do - undef_method "_run_#{symbol}_callbacks" if method_defined?("_run_#{symbol}_callbacks") - class_eval body, __FILE__, line + private :_run_#{symbol}_callbacks + RUBY_EVAL end end diff --git a/activesupport/lib/active_support/core_ext/class/attribute.rb b/activesupport/lib/active_support/core_ext/class/attribute.rb index d2bcd7a778..576366e496 100644 --- a/activesupport/lib/active_support/core_ext/class/attribute.rb +++ b/activesupport/lib/active_support/core_ext/class/attribute.rb @@ -61,10 +61,7 @@ class Class end RUBY - if instance_writer - body = "def #{name}=(value) @#{name} = value end" - class_eval body, __FILE__, __LINE__ - 1 - end + attr_writer name if instance_writer end end end diff --git a/activesupport/lib/active_support/core_ext/module/attr_internal.rb b/activesupport/lib/active_support/core_ext/module/attr_internal.rb index d052bfed2d..28bc30ae26 100644 --- a/activesupport/lib/active_support/core_ext/module/attr_internal.rb +++ b/activesupport/lib/active_support/core_ext/module/attr_internal.rb @@ -2,14 +2,14 @@ class Module # Declares an attribute reader backed by an internally-named instance variable. def attr_internal_reader(*attrs) attrs.each do |attr| - module_eval "def #{attr}() #{attr_internal_ivar_name(attr)} end" + module_eval "def #{attr}() #{attr_internal_ivar_name(attr)} end", __FILE__, __LINE__ end end # Declares an attribute writer backed by an internally-named instance variable. def attr_internal_writer(*attrs) attrs.each do |attr| - module_eval "def #{attr}=(v) #{attr_internal_ivar_name(attr)} = v end" + module_eval "def #{attr}=(v) #{attr_internal_ivar_name(attr)} = v end", __FILE__, __LINE__ end end diff --git a/activesupport/lib/active_support/core_ext/string/multibyte.rb b/activesupport/lib/active_support/core_ext/string/multibyte.rb index 42e053d0f8..3dfe996d06 100644 --- a/activesupport/lib/active_support/core_ext/string/multibyte.rb +++ b/activesupport/lib/active_support/core_ext/string/multibyte.rb @@ -2,7 +2,7 @@ require 'active_support/multibyte' class String - unless '1.9'.respond_to?(:force_encoding) + if '1.9'.respond_to?(:force_encoding) # == Multibyte proxy # # +mb_chars+ is a multibyte safe proxy for string methods. @@ -37,23 +37,13 @@ class String # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For # information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte. def mb_chars - if ActiveSupport::Multibyte.proxy_class.wants?(self) + if ActiveSupport::Multibyte.proxy_class.consumes?(self) ActiveSupport::Multibyte.proxy_class.new(self) else self end end - - # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have - # them), returns false otherwise. - def is_utf8? - ActiveSupport::Multibyte::Chars.consumes?(self) - end - else - def mb_chars #:nodoc - self - end - + def is_utf8? #:nodoc case encoding when Encoding::UTF_8 @@ -64,5 +54,19 @@ class String false end end + else + def mb_chars + if ActiveSupport::Multibyte.proxy_class.wants?(self) + ActiveSupport::Multibyte.proxy_class.new(self) + else + self + end + end + + # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have + # them), returns false otherwise. + def is_utf8? + ActiveSupport::Multibyte::Chars.consumes?(self) + end end end diff --git a/activesupport/lib/active_support/dependencies.rb b/activesupport/lib/active_support/dependencies.rb index 9c4412c28c..e14e225596 100644 --- a/activesupport/lib/active_support/dependencies.rb +++ b/activesupport/lib/active_support/dependencies.rb @@ -66,7 +66,7 @@ module ActiveSupport #:nodoc: end def self.locked(*methods) - methods.each { |m| class_eval "def #{m}(*) lock { super } end" } + methods.each { |m| class_eval "def #{m}(*) lock { super } end", __FILE__, __LINE__ } end def get(key) diff --git a/activesupport/lib/active_support/duration.rb b/activesupport/lib/active_support/duration.rb index db5afb5324..cd0d66a482 100644 --- a/activesupport/lib/active_support/duration.rb +++ b/activesupport/lib/active_support/duration.rb @@ -38,6 +38,7 @@ module ActiveSupport def is_a?(klass) #:nodoc: Duration == klass || value.is_a?(klass) end + alias :kind_of? :is_a? # Returns true if <tt>other</tt> is also a Duration instance with the # same <tt>value</tt>, or if <tt>other == value</tt>. diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index 2344bb1bb3..bccc5425a6 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -58,8 +58,9 @@ module ActiveSupport # transliterate("Jürgen") # # => "Juergen" def transliterate(string, replacement = "?") - I18n.transliterate(Multibyte::Chars.normalize( - Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement) + I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize( + ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c), + :replacement => replacement) end # Replaces special characters in a string so that it may be used as part of a 'pretty' URL. diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb index 428c48a484..e7a271a660 100644 --- a/activesupport/lib/active_support/multibyte.rb +++ b/activesupport/lib/active_support/multibyte.rb @@ -1,30 +1,12 @@ # encoding: utf-8 - require 'active_support/core_ext/module/attribute_accessors' module ActiveSupport #:nodoc: module Multibyte autoload :EncodingError, 'active_support/multibyte/exceptions' autoload :Chars, 'active_support/multibyte/chars' - autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database' - autoload :Codepoint, 'active_support/multibyte/unicode_database' - autoload :UCD, 'active_support/multibyte/unicode_database' + autoload :Unicode, 'active_support/multibyte/unicode' - # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more - # information about normalization. - NORMALIZATION_FORMS = [:c, :kc, :d, :kd] - - # The Unicode version that is supported by the implementation - UNICODE_VERSION = '5.1.0' - - # The default normalization used for operations that require normalization. It can be set to any of the - # normalizations in NORMALIZATION_FORMS. - # - # Example: - # ActiveSupport::Multibyte.default_normalization_form = :c - mattr_accessor :default_normalization_form - self.default_normalization_form = :kc - # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for # an example how to do this. diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb index cca30d1141..429b65bf15 100644 --- a/activesupport/lib/active_support/multibyte/chars.rb +++ b/activesupport/lib/active_support/multibyte/chars.rb @@ -34,52 +34,12 @@ module ActiveSupport #:nodoc: # # ActiveSupport::Multibyte.proxy_class = CharsForUTF32 class Chars - # Hangul character boundaries and properties - HANGUL_SBASE = 0xAC00 - HANGUL_LBASE = 0x1100 - HANGUL_VBASE = 0x1161 - HANGUL_TBASE = 0x11A7 - HANGUL_LCOUNT = 19 - HANGUL_VCOUNT = 21 - HANGUL_TCOUNT = 28 - HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT - HANGUL_SCOUNT = 11172 - HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT - HANGUL_JAMO_FIRST = 0x1100 - HANGUL_JAMO_LAST = 0x11FF - - # All the unicode whitespace - UNICODE_WHITESPACE = [ - (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D> - 0x0020, # White_Space # Zs SPACE - 0x0085, # White_Space # Cc <control-0085> - 0x00A0, # White_Space # Zs NO-BREAK SPACE - 0x1680, # White_Space # Zs OGHAM SPACE MARK - 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR - (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE - 0x2028, # White_Space # Zl LINE SEPARATOR - 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR - 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE - 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE - 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE - ].flatten.freeze - - # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish - # between little and big endian. This is not an issue in utf-8, so it must be ignored. - UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM - - # Returns a regular expression pattern that matches the passed Unicode codepoints - def self.codepoints_to_pattern(array_of_codepoints) #:nodoc: - array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') - end - UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/u - UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/u attr_reader :wrapped_string alias to_s wrapped_string alias to_str wrapped_string - if '1.9'.respond_to?(:force_encoding) + if RUBY_VERSION >= "1.9" # Creates a new Chars instance by wrapping _string_. def initialize(string) @wrapped_string = string @@ -113,12 +73,6 @@ module ActiveSupport #:nodoc: true end - # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns - # +false+ otherwise. - def self.wants?(string) - $KCODE == 'UTF8' && consumes?(string) - end - # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise. def self.consumes?(string) # Unpack is a little bit faster than regular expressions. @@ -130,89 +84,131 @@ module ActiveSupport #:nodoc: include Comparable - # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before, - # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+. - # See <tt>String#<=></tt> for more details. - # - # Example: - # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1 - def <=>(other) - @wrapped_string <=> other.to_s - end + if RUBY_VERSION < "1.9" + # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns + # +false+ otherwise. + def self.wants?(string) + $KCODE == 'UTF8' && consumes?(string) + end - # Returns a new Chars object containing the _other_ object concatenated to the string. - # - # Example: - # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl" - def +(other) - self << other - end + # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before, + # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+. + # See <tt>String#<=></tt> for more details. + # + # Example: + # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1 + def <=>(other) + @wrapped_string <=> other.to_s + end - # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset. - # - # Example: - # 'Café périferôl'.mb_chars =~ /ô/ #=> 12 - def =~(other) - translate_offset(@wrapped_string =~ other) - end + # Returns a new Chars object containing the _other_ object concatenated to the string. + # + # Example: + # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl" + def +(other) + self << other + end - # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars - # instances instead of String. This makes chaining methods easier. - # - # Example: - # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"] - def split(*args) - @wrapped_string.split(*args).map { |i| i.mb_chars } - end + # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset. + # + # Example: + # 'Café périferôl'.mb_chars =~ /ô/ #=> 12 + def =~(other) + translate_offset(@wrapped_string =~ other) + end - # Inserts the passed string at specified codepoint offsets. - # - # Example: - # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl" - def insert(offset, fragment) - unpacked = self.class.u_unpack(@wrapped_string) - unless offset > unpacked.length - @wrapped_string.replace( - self.class.u_unpack(@wrapped_string).insert(offset, *self.class.u_unpack(fragment)).pack('U*') - ) - else - raise IndexError, "index #{offset} out of string" + # Inserts the passed string at specified codepoint offsets. + # + # Example: + # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl" + def insert(offset, fragment) + unpacked = Unicode.u_unpack(@wrapped_string) + unless offset > unpacked.length + @wrapped_string.replace( + Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*') + ) + else + raise IndexError, "index #{offset} out of string" + end + self end - self - end - # Returns +true+ if contained string contains _other_. Returns +false+ otherwise. - # - # Example: - # 'Café'.mb_chars.include?('é') #=> true - def include?(other) - # We have to redefine this method because Enumerable defines it. - @wrapped_string.include?(other) - end + # Returns +true+ if contained string contains _other_. Returns +false+ otherwise. + # + # Example: + # 'Café'.mb_chars.include?('é') #=> true + def include?(other) + # We have to redefine this method because Enumerable defines it. + @wrapped_string.include?(other) + end - # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found. - # - # Example: - # 'Café périferôl'.mb_chars.index('ô') #=> 12 - # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0 - def index(needle, offset=0) - wrapped_offset = first(offset).wrapped_string.length - index = @wrapped_string.index(needle, wrapped_offset) - index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil + # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found. + # + # Example: + # 'Café périferôl'.mb_chars.index('ô') #=> 12 + # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0 + def index(needle, offset=0) + wrapped_offset = first(offset).wrapped_string.length + index = @wrapped_string.index(needle, wrapped_offset) + index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil + end + + # Returns the position _needle_ in the string, counting in + # codepoints, searching backward from _offset_ or the end of the + # string. Returns +nil+ if _needle_ isn't found. + # + # Example: + # 'Café périferôl'.mb_chars.rindex('é') #=> 6 + # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13 + def rindex(needle, offset=nil) + offset ||= length + wrapped_offset = first(offset).wrapped_string.length + index = @wrapped_string.rindex(needle, wrapped_offset) + index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil + end + + # Returns the number of codepoints in the string + def size + Unicode.u_unpack(@wrapped_string).size + end + alias_method :length, :size + + # Strips entire range of Unicode whitespace from the right of the string. + def rstrip + chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, '')) + end + + # Strips entire range of Unicode whitespace from the left of the string. + def lstrip + chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, '')) + end + + # Strips entire range of Unicode whitespace from the right and left of the string. + def strip + rstrip.lstrip + end + + # Returns the codepoint of the first character in the string. + # + # Example: + # 'こんにちは'.mb_chars.ord #=> 12371 + def ord + Unicode.u_unpack(@wrapped_string)[0] + end + + else + def =~(other) + @wrapped_string =~ other + end end - # Returns the position _needle_ in the string, counting in - # codepoints, searching backward from _offset_ or the end of the - # string. Returns +nil+ if _needle_ isn't found. + # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars + # instances instead of String. This makes chaining methods easier. # # Example: - # 'Café périferôl'.mb_chars.rindex('é') #=> 6 - # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13 - def rindex(needle, offset=nil) - offset ||= length - wrapped_offset = first(offset).wrapped_string.length - index = @wrapped_string.rindex(needle, wrapped_offset) - index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil + # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"] + def split(*args) + @wrapped_string.split(*args).map { |i| i.mb_chars } end # Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets. @@ -234,7 +230,7 @@ module ActiveSupport #:nodoc: if args.first.is_a?(Regexp) @wrapped_string[*args] = replace_by else - result = self.class.u_unpack(@wrapped_string) + result = Unicode.u_unpack(@wrapped_string) if args[0].is_a?(Fixnum) raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length min = args[0] @@ -247,10 +243,10 @@ module ActiveSupport #:nodoc: else needle = args[0].to_s min = index(needle) - max = min + self.class.u_unpack(needle).length - 1 + max = min + Unicode.u_unpack(needle).length - 1 range = Range.new(min, max) end - result[range] = self.class.u_unpack(replace_by) + result[range] = Unicode.u_unpack(replace_by) @wrapped_string.replace(result.pack('U*')) end end @@ -294,33 +290,13 @@ module ActiveSupport #:nodoc: justify(integer, :center, padstr) end - # Strips entire range of Unicode whitespace from the right of the string. - def rstrip - chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) - end - - # Strips entire range of Unicode whitespace from the left of the string. - def lstrip - chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) - end - - # Strips entire range of Unicode whitespace from the right and left of the string. - def strip - rstrip.lstrip - end - - # Returns the number of codepoints in the string - def size - self.class.u_unpack(@wrapped_string).size - end - alias_method :length, :size # Reverses all characters in the string. # # Example: # 'Café'.mb_chars.reverse.to_s #=> 'éfaC' def reverse - chars(self.class.g_unpack(@wrapped_string).reverse.flatten.pack('U*')) + chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*')) end # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that @@ -336,15 +312,15 @@ module ActiveSupport #:nodoc: elsif (args.size == 2 && !args[1].is_a?(Numeric)) raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native elsif args[0].kind_of? Range - cps = self.class.u_unpack(@wrapped_string).slice(*args) + cps = Unicode.u_unpack(@wrapped_string).slice(*args) result = cps.nil? ? nil : cps.pack('U*') elsif args[0].kind_of? Regexp result = @wrapped_string.slice(*args) elsif args.size == 1 && args[0].kind_of?(Numeric) - character = self.class.u_unpack(@wrapped_string)[args[0]] + character = Unicode.u_unpack(@wrapped_string)[args[0]] result = character.nil? ? nil : [character].pack('U') else - result = self.class.u_unpack(@wrapped_string).slice(*args).pack('U*') + result = Unicode.u_unpack(@wrapped_string).slice(*args).pack('U*') end result.nil? ? nil : chars(result) end @@ -372,20 +348,12 @@ module ActiveSupport #:nodoc: slice(0...translate_offset(limit)) end - # Returns the codepoint of the first character in the string. - # - # Example: - # 'こんにちは'.mb_chars.ord #=> 12371 - def ord - self.class.u_unpack(@wrapped_string)[0] - end - # Convert characters in the string to uppercase. # # Example: # 'Laurent, où sont les tests ?'.mb_chars.upcase.to_s #=> "LAURENT, OÙ SONT LES TESTS ?" def upcase - apply_mapping :uppercase_mapping + chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping) end # Convert characters in the string to lowercase. @@ -393,7 +361,7 @@ module ActiveSupport #:nodoc: # Example: # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum" def downcase - apply_mapping :lowercase_mapping + chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping) end # Converts the first character to uppercase and the remainder to lowercase. @@ -409,9 +377,9 @@ module ActiveSupport #:nodoc: # # * <tt>form</tt> - The form you want to normalize in. Should be one of the following: # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is - # ActiveSupport::Multibyte.default_normalization_form - def normalize(form=ActiveSupport::Multibyte.default_normalization_form) - chars(self.class.normalize(@wrapped_string, form)) + # ActiveSupport::Multibyte::Unicode.default_normalization_form + def normalize(form = nil) + chars(Unicode.normalize(@wrapped_string, form)) end # Performs canonical decomposition on all the characters. @@ -420,7 +388,7 @@ module ActiveSupport #:nodoc: # 'é'.length #=> 2 # 'é'.mb_chars.decompose.to_s.length #=> 3 def decompose - chars(self.class.decompose_codepoints(:canonical, self.class.u_unpack(@wrapped_string)).pack('U*')) + chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*')) end # Performs composition on all the characters. @@ -429,7 +397,7 @@ module ActiveSupport #:nodoc: # 'é'.length #=> 3 # 'é'.mb_chars.compose.to_s.length #=> 2 def compose - chars(self.class.compose_codepoints(self.class.u_unpack(@wrapped_string)).pack('U*')) + chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*')) end # Returns the number of grapheme clusters in the string. @@ -438,14 +406,14 @@ module ActiveSupport #:nodoc: # 'क्षि'.mb_chars.length #=> 4 # 'क्षि'.mb_chars.g_length #=> 3 def g_length - self.class.g_unpack(@wrapped_string).length + Unicode.g_unpack(@wrapped_string).length end # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. # # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. def tidy_bytes(force = false) - chars(self.class.tidy_bytes(@wrapped_string, force)) + chars(Unicode.tidy_bytes(@wrapped_string, force)) end %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method| @@ -459,266 +427,6 @@ module ActiveSupport #:nodoc: end end - class << self - - # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't - # valid UTF-8. - # - # Example: - # Chars.u_unpack('Café') #=> [67, 97, 102, 233] - def u_unpack(string) - begin - string.unpack 'U*' - rescue ArgumentError - raise EncodingError, 'malformed UTF-8 character' - end - end - - # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified - # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>, - # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>. - # - # Primarily used by the grapheme cluster support. - def in_char_class?(codepoint, classes) - classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false - end - - # Unpack the string at grapheme boundaries. Returns a list of character lists. - # - # Example: - # Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]] - # Chars.g_unpack('Café') #=> [[67], [97], [102], [233]] - def g_unpack(string) - codepoints = u_unpack(string) - unpacked = [] - pos = 0 - marker = 0 - eoc = codepoints.length - while(pos < eoc) - pos += 1 - previous = codepoints[pos-1] - current = codepoints[pos] - if ( - # CR X LF - one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or - # L X (L|V|LV|LVT) - two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or - # (LV|V) X (V|T) - three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or - # (LVT|T) X (T) - four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or - # X Extend - five = (UCD.boundary[:extend] === current) - ) - else - unpacked << codepoints[marker..pos-1] - marker = pos - end - end - unpacked - end - - # Reverse operation of g_unpack. - # - # Example: - # Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि' - def g_pack(unpacked) - (unpacked.flatten).pack('U*') - end - - def padding(padsize, padstr=' ') #:nodoc: - if padsize != 0 - new(padstr * ((padsize / u_unpack(padstr).size) + 1)).slice(0, padsize) - else - '' - end - end - - # Re-order codepoints so the string becomes canonical. - def reorder_characters(codepoints) - length = codepoints.length- 1 - pos = 0 - while pos < length do - cp1, cp2 = UCD.codepoints[codepoints[pos]], UCD.codepoints[codepoints[pos+1]] - if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) - codepoints[pos..pos+1] = cp2.code, cp1.code - pos += (pos > 0 ? -1 : 1) - else - pos += 1 - end - end - codepoints - end - - # Decompose composed characters to the decomposed form. - def decompose_codepoints(type, codepoints) - codepoints.inject([]) do |decomposed, cp| - # if it's a hangul syllable starter character - if HANGUL_SBASE <= cp and cp < HANGUL_SLAST - sindex = cp - HANGUL_SBASE - ncp = [] # new codepoints - ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT - ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT - tindex = sindex % HANGUL_TCOUNT - ncp << (HANGUL_TBASE + tindex) unless tindex == 0 - decomposed.concat ncp - # if the codepoint is decomposable in with the current decomposition type - elsif (ncp = UCD.codepoints[cp].decomp_mapping) and (!UCD.codepoints[cp].decomp_type || type == :compatability) - decomposed.concat decompose_codepoints(type, ncp.dup) - else - decomposed << cp - end - end - end - - # Compose decomposed characters to the composed form. - def compose_codepoints(codepoints) - pos = 0 - eoa = codepoints.length - 1 - starter_pos = 0 - starter_char = codepoints[0] - previous_combining_class = -1 - while pos < eoa - pos += 1 - lindex = starter_char - HANGUL_LBASE - # -- Hangul - if 0 <= lindex and lindex < HANGUL_LCOUNT - vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 - if 0 <= vindex and vindex < HANGUL_VCOUNT - tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 - if 0 <= tindex and tindex < HANGUL_TCOUNT - j = starter_pos + 2 - eoa -= 2 - else - tindex = 0 - j = starter_pos + 1 - eoa -= 1 - end - codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE - end - starter_pos += 1 - starter_char = codepoints[starter_pos] - # -- Other characters - else - current_char = codepoints[pos] - current = UCD.codepoints[current_char] - if current.combining_class > previous_combining_class - if ref = UCD.composition_map[starter_char] - composition = ref[current_char] - else - composition = nil - end - unless composition.nil? - codepoints[starter_pos] = composition - starter_char = composition - codepoints.delete_at pos - eoa -= 1 - pos -= 1 - previous_combining_class = -1 - else - previous_combining_class = current.combining_class - end - else - previous_combining_class = current.combining_class - end - if current.combining_class == 0 - starter_pos = pos - starter_char = codepoints[pos] - end - end - end - codepoints - end - - def tidy_byte(byte) - if byte < 160 - [UCD.cp1252[byte] || byte].pack("U").unpack("C*") - elsif byte < 192 - [194, byte] - else - [195, byte - 64] - end - end - private :tidy_byte - - # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. - # - # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. - def tidy_bytes(string, force = false) - if force - return string.unpack("C*").map do |b| - tidy_byte(b) - end.flatten.compact.pack("C*").unpack("U*").pack("U*") - end - - bytes = string.unpack("C*") - conts_expected = 0 - last_lead = 0 - - bytes.each_index do |i| - - byte = bytes[i] - is_ascii = byte < 128 - is_cont = byte > 127 && byte < 192 - is_lead = byte > 191 && byte < 245 - is_unused = byte > 240 - is_restricted = byte > 244 - - # Impossible or highly unlikely byte? Clean it. - if is_unused || is_restricted - bytes[i] = tidy_byte(byte) - elsif is_cont - # Not expecting contination byte? Clean up. Otherwise, now expect one less. - conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 - else - if conts_expected > 0 - # Expected continuation, but got ASCII or leading? Clean backwards up to - # the leading byte. - (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - conts_expected = 0 - end - if is_lead - # Final byte is leading? Clean it. - if i == bytes.length - 1 - bytes[i] = tidy_byte(bytes.last) - else - # Valid leading byte? Expect continuations determined by position of - # first zero bit, with max of 3. - conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 - last_lead = i - end - end - end - end - bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") - end - - # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for - # passing strings to databases and validations. - # - # * <tt>string</tt> - The string to perform normalization on. - # * <tt>form</tt> - The form you want to normalize in. Should be one of the following: - # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is - # ActiveSupport::Multibyte.default_normalization_form - def normalize(string, form=ActiveSupport::Multibyte.default_normalization_form) - # See http://www.unicode.org/reports/tr15, Table 1 - codepoints = u_unpack(string) - case form - when :d - reorder_characters(decompose_codepoints(:canonical, codepoints)) - when :c - compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints))) - when :kd - reorder_characters(decompose_codepoints(:compatability, codepoints)) - when :kc - compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints))) - else - raise ArgumentError, "#{form} is not a valid normalization variant", caller - end.pack('U*') - end - - end - protected def translate_offset(byte_offset) #:nodoc: @@ -743,26 +451,23 @@ module ActiveSupport #:nodoc: padsize = padsize > 0 ? padsize : 0 case way when :right - result = @wrapped_string.dup.insert(0, self.class.padding(padsize, padstr)) + result = @wrapped_string.dup.insert(0, padding(padsize, padstr)) when :left - result = @wrapped_string.dup.insert(-1, self.class.padding(padsize, padstr)) + result = @wrapped_string.dup.insert(-1, padding(padsize, padstr)) when :center - lpad = self.class.padding((padsize / 2.0).floor, padstr) - rpad = self.class.padding((padsize / 2.0).ceil, padstr) + lpad = padding((padsize / 2.0).floor, padstr) + rpad = padding((padsize / 2.0).ceil, padstr) result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad) end chars(result) end - def apply_mapping(mapping) #:nodoc: - chars(self.class.u_unpack(@wrapped_string).map do |codepoint| - cp = UCD.codepoints[codepoint] - if cp and (ncp = cp.send(mapping)) and ncp > 0 - ncp - else - codepoint - end - end.pack('U*')) + def padding(padsize, padstr=' ') #:nodoc: + if padsize != 0 + chars(padstr * ((padsize / Unicode.u_unpack(padstr).size) + 1)).slice(0, padsize) + else + '' + end end def chars(string) #:nodoc: diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb new file mode 100644 index 0000000000..f91e50c755 --- /dev/null +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -0,0 +1,393 @@ +module ActiveSupport + module Multibyte + module Unicode + + extend self + + # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more + # information about normalization. + NORMALIZATION_FORMS = [:c, :kc, :d, :kd] + + # The Unicode version that is supported by the implementation + UNICODE_VERSION = '5.1.0' + + # The default normalization used for operations that require normalization. It can be set to any of the + # normalizations in NORMALIZATION_FORMS. + # + # Example: + # ActiveSupport::Multibyte::Unicode.default_normalization_form = :c + attr_accessor :default_normalization_form + @default_normalization_form = :kc + + # Hangul character boundaries and properties + HANGUL_SBASE = 0xAC00 + HANGUL_LBASE = 0x1100 + HANGUL_VBASE = 0x1161 + HANGUL_TBASE = 0x11A7 + HANGUL_LCOUNT = 19 + HANGUL_VCOUNT = 21 + HANGUL_TCOUNT = 28 + HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT + HANGUL_SCOUNT = 11172 + HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT + HANGUL_JAMO_FIRST = 0x1100 + HANGUL_JAMO_LAST = 0x11FF + + # All the unicode whitespace + WHITESPACE = [ + (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D> + 0x0020, # White_Space # Zs SPACE + 0x0085, # White_Space # Cc <control-0085> + 0x00A0, # White_Space # Zs NO-BREAK SPACE + 0x1680, # White_Space # Zs OGHAM SPACE MARK + 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR + (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE + 0x2028, # White_Space # Zl LINE SEPARATOR + 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR + 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE + 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE + 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE + ].flatten.freeze + + # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish + # between little and big endian. This is not an issue in utf-8, so it must be ignored. + LEADERS_AND_TRAILERS = WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM + + # Returns a regular expression pattern that matches the passed Unicode codepoints + def self.codepoints_to_pattern(array_of_codepoints) #:nodoc: + array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') + end + TRAILERS_PAT = /(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+\Z/u + LEADERS_PAT = /\A(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+/u + + # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't + # valid UTF-8. + # + # Example: + # Unicode.u_unpack('Café') #=> [67, 97, 102, 233] + def u_unpack(string) + begin + string.unpack 'U*' + rescue ArgumentError + raise EncodingError, 'malformed UTF-8 character' + end + end + + # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified + # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>, + # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>. + # + # Primarily used by the grapheme cluster support. + def in_char_class?(codepoint, classes) + classes.detect { |c| database.boundary[c] === codepoint } ? true : false + end + + # Unpack the string at grapheme boundaries. Returns a list of character lists. + # + # Example: + # Unicode.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]] + # Unicode.g_unpack('Café') #=> [[67], [97], [102], [233]] + def g_unpack(string) + codepoints = u_unpack(string) + unpacked = [] + pos = 0 + marker = 0 + eoc = codepoints.length + while(pos < eoc) + pos += 1 + previous = codepoints[pos-1] + current = codepoints[pos] + if ( + # CR X LF + one = ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or + # L X (L|V|LV|LVT) + two = ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or + # (LV|V) X (V|T) + three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or + # (LVT|T) X (T) + four = ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or + # X Extend + five = (database.boundary[:extend] === current) + ) + else + unpacked << codepoints[marker..pos-1] + marker = pos + end + end + unpacked + end + + # Reverse operation of g_unpack. + # + # Example: + # Unicode.g_pack(Unicode.g_unpack('क्षि')) #=> 'क्षि' + def g_pack(unpacked) + (unpacked.flatten).pack('U*') + end + + # Re-order codepoints so the string becomes canonical. + def reorder_characters(codepoints) + length = codepoints.length- 1 + pos = 0 + while pos < length do + cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]] + if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) + codepoints[pos..pos+1] = cp2.code, cp1.code + pos += (pos > 0 ? -1 : 1) + else + pos += 1 + end + end + codepoints + end + + # Decompose composed characters to the decomposed form. + def decompose_codepoints(type, codepoints) + codepoints.inject([]) do |decomposed, cp| + # if it's a hangul syllable starter character + if HANGUL_SBASE <= cp and cp < HANGUL_SLAST + sindex = cp - HANGUL_SBASE + ncp = [] # new codepoints + ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT + ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT + tindex = sindex % HANGUL_TCOUNT + ncp << (HANGUL_TBASE + tindex) unless tindex == 0 + decomposed.concat ncp + # if the codepoint is decomposable in with the current decomposition type + elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability) + decomposed.concat decompose_codepoints(type, ncp.dup) + else + decomposed << cp + end + end + end + + # Compose decomposed characters to the composed form. + def compose_codepoints(codepoints) + pos = 0 + eoa = codepoints.length - 1 + starter_pos = 0 + starter_char = codepoints[0] + previous_combining_class = -1 + while pos < eoa + pos += 1 + lindex = starter_char - HANGUL_LBASE + # -- Hangul + if 0 <= lindex and lindex < HANGUL_LCOUNT + vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 + if 0 <= vindex and vindex < HANGUL_VCOUNT + tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 + if 0 <= tindex and tindex < HANGUL_TCOUNT + j = starter_pos + 2 + eoa -= 2 + else + tindex = 0 + j = starter_pos + 1 + eoa -= 1 + end + codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE + end + starter_pos += 1 + starter_char = codepoints[starter_pos] + # -- Other characters + else + current_char = codepoints[pos] + current = database.codepoints[current_char] + if current.combining_class > previous_combining_class + if ref = database.composition_map[starter_char] + composition = ref[current_char] + else + composition = nil + end + unless composition.nil? + codepoints[starter_pos] = composition + starter_char = composition + codepoints.delete_at pos + eoa -= 1 + pos -= 1 + previous_combining_class = -1 + else + previous_combining_class = current.combining_class + end + else + previous_combining_class = current.combining_class + end + if current.combining_class == 0 + starter_pos = pos + starter_char = codepoints[pos] + end + end + end + codepoints + end + + # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. + # + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. + def tidy_bytes(string, force = false) + if force + return string.unpack("C*").map do |b| + tidy_byte(b) + end.flatten.compact.pack("C*").unpack("U*").pack("U*") + end + + bytes = string.unpack("C*") + conts_expected = 0 + last_lead = 0 + + bytes.each_index do |i| + + byte = bytes[i] + is_ascii = byte < 128 + is_cont = byte > 127 && byte < 192 + is_lead = byte > 191 && byte < 245 + is_unused = byte > 240 + is_restricted = byte > 244 + + # Impossible or highly unlikely byte? Clean it. + if is_unused || is_restricted + bytes[i] = tidy_byte(byte) + elsif is_cont + # Not expecting contination byte? Clean up. Otherwise, now expect one less. + conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 + else + if conts_expected > 0 + # Expected continuation, but got ASCII or leading? Clean backwards up to + # the leading byte. + (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} + conts_expected = 0 + end + if is_lead + # Final byte is leading? Clean it. + if i == bytes.length - 1 + bytes[i] = tidy_byte(bytes.last) + else + # Valid leading byte? Expect continuations determined by position of + # first zero bit, with max of 3. + conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 + last_lead = i + end + end + end + end + bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") + end + + # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for + # passing strings to databases and validations. + # + # * <tt>string</tt> - The string to perform normalization on. + # * <tt>form</tt> - The form you want to normalize in. Should be one of the following: + # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is + # ActiveSupport::Multibyte.default_normalization_form + def normalize(string, form=nil) + form ||= @default_normalization_form + # See http://www.unicode.org/reports/tr15, Table 1 + codepoints = u_unpack(string) + case form + when :d + reorder_characters(decompose_codepoints(:canonical, codepoints)) + when :c + compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints))) + when :kd + reorder_characters(decompose_codepoints(:compatability, codepoints)) + when :kc + compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints))) + else + raise ArgumentError, "#{form} is not a valid normalization variant", caller + end.pack('U*') + end + + def apply_mapping(string, mapping) #:nodoc: + u_unpack(string).map do |codepoint| + cp = database.codepoints[codepoint] + if cp and (ncp = cp.send(mapping)) and ncp > 0 + ncp + else + codepoint + end + end.pack('U*') + end + + # Holds data about a codepoint in the Unicode database + class Codepoint + attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping + end + + # Holds static data from the Unicode database + class UnicodeDatabase + ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 + + attr_writer(*ATTRIBUTES) + + def initialize + @codepoints = Hash.new(Codepoint.new) + @composition_exclusion = [] + @composition_map = {} + @boundary = {} + @cp1252 = {} + end + + # Lazy load the Unicode database so it's only loaded when it's actually used + ATTRIBUTES.each do |attr_name| + class_eval(<<-EOS, __FILE__, __LINE__ + 1) + def #{attr_name} # def codepoints + load # load + @#{attr_name} # @codepoints + end # end + EOS + end + + # Loads the Unicode database and returns all the internal objects of UnicodeDatabase. + def load + begin + @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read } + rescue Exception => e + raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable") + end + + # Redefine the === method so we can write shorter rules for grapheme cluster breaks + @boundary.each do |k,_| + @boundary[k].instance_eval do + def ===(other) + detect { |i| i === other } ? true : false + end + end if @boundary[k].kind_of?(Array) + end + + # define attr_reader methods for the instance variables + class << self + attr_reader(*ATTRIBUTES) + end + end + + # Returns the directory in which the data files are stored + def self.dirname + File.dirname(__FILE__) + '/../values/' + end + + # Returns the filename for the data file for this version + def self.filename + File.expand_path File.join(dirname, "unicode_tables.dat") + end + end + + private + + def tidy_byte(byte) + if byte < 160 + [database.cp1252[byte] || byte].pack("U").unpack("C*") + elsif byte < 192 + [194, byte] + else + [195, byte - 64] + end + end + + def database + @database ||= UnicodeDatabase.new + end + + end + end +end diff --git a/activesupport/lib/active_support/multibyte/unicode_database.rb b/activesupport/lib/active_support/multibyte/unicode_database.rb deleted file mode 100644 index 074ad8613a..0000000000 --- a/activesupport/lib/active_support/multibyte/unicode_database.rb +++ /dev/null @@ -1,71 +0,0 @@ -# encoding: utf-8 - -module ActiveSupport #:nodoc: - module Multibyte #:nodoc: - # Holds data about a codepoint in the Unicode database - class Codepoint - attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping - end - - # Holds static data from the Unicode database - class UnicodeDatabase - ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 - - attr_writer(*ATTRIBUTES) - - def initialize - @codepoints = Hash.new(Codepoint.new) - @composition_exclusion = [] - @composition_map = {} - @boundary = {} - @cp1252 = {} - end - - # Lazy load the Unicode database so it's only loaded when it's actually used - ATTRIBUTES.each do |attr_name| - class_eval(<<-EOS, __FILE__, __LINE__ + 1) - def #{attr_name} # def codepoints - load # load - @#{attr_name} # @codepoints - end # end - EOS - end - - # Loads the Unicode database and returns all the internal objects of UnicodeDatabase. - def load - begin - @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read } - rescue Exception => e - raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable") - end - - # Redefine the === method so we can write shorter rules for grapheme cluster breaks - @boundary.each do |k,_| - @boundary[k].instance_eval do - def ===(other) - detect { |i| i === other } ? true : false - end - end if @boundary[k].kind_of?(Array) - end - - # define attr_reader methods for the instance variables - class << self - attr_reader(*ATTRIBUTES) - end - end - - # Returns the directory in which the data files are stored - def self.dirname - File.dirname(__FILE__) + '/../values/' - end - - # Returns the filename for the data file for this version - def self.filename - File.expand_path File.join(dirname, "unicode_tables.dat") - end - end - - # UniCode Database - UCD = UnicodeDatabase.new - end -end
\ No newline at end of file diff --git a/activesupport/lib/active_support/railtie.rb b/activesupport/lib/active_support/railtie.rb index 0243157e35..59f9ab18b1 100644 --- a/activesupport/lib/active_support/railtie.rb +++ b/activesupport/lib/active_support/railtie.rb @@ -48,19 +48,20 @@ module I18n # Set the i18n configuration from config.i18n but special-case for # the load_path which should be appended to what's already set instead of overwritten. config.after_initialize do |app| + fallbacks = app.config.i18n.delete(:fallbacks) + app.config.i18n.each do |setting, value| case setting when :railties_load_path app.config.i18n.load_path.unshift(*value) when :load_path I18n.load_path += value - when :fallbacks - init_fallbacks(value) if value && validate_fallbacks(value) else I18n.send("#{setting}=", value) end end + init_fallbacks(fallbacks) if fallbacks && validate_fallbacks(fallbacks) I18n.reload! end diff --git a/activesupport/lib/active_support/testing/declarative.rb b/activesupport/lib/active_support/testing/declarative.rb index a7df473644..70a6c2ca60 100644 --- a/activesupport/lib/active_support/testing/declarative.rb +++ b/activesupport/lib/active_support/testing/declarative.rb @@ -7,7 +7,7 @@ module ActiveSupport unless method_defined?(:describe) def self.describe(text) - class_eval <<-RUBY_EVAL + class_eval <<-RUBY_EVAL, __FILE__, __LINE__ + 1 def self.name "#{text}" end diff --git a/activesupport/lib/active_support/values/unicode_tables.dat b/activesupport/lib/active_support/values/unicode_tables.dat Binary files differindex 74b333d416..10f2cae465 100644 --- a/activesupport/lib/active_support/values/unicode_tables.dat +++ b/activesupport/lib/active_support/values/unicode_tables.dat |