From f3abc8ac36055afed9fcc902c33ee146e066d17a Mon Sep 17 00:00:00 2001 From: Norman Clarke Date: Mon, 10 May 2010 10:46:37 -0300 Subject: Use multibyte proxy class on 1.9, refactor Unicode. Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding. Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency. [#4594 state:resolved] Signed-off-by: Jeremy Kemper --- .../active_support/core_ext/string/multibyte.rb | 30 +- .../lib/active_support/inflector/transliterate.rb | 5 +- activesupport/lib/active_support/multibyte.rb | 20 +- .../lib/active_support/multibyte/chars.rb | 577 +++++---------------- .../lib/active_support/multibyte/unicode.rb | 393 ++++++++++++++ .../active_support/multibyte/unicode_database.rb | 71 --- .../lib/active_support/values/unicode_tables.dat | Bin 710734 -> 710743 bytes 7 files changed, 555 insertions(+), 541 deletions(-) create mode 100644 activesupport/lib/active_support/multibyte/unicode.rb delete mode 100644 activesupport/lib/active_support/multibyte/unicode_database.rb (limited to 'activesupport/lib/active_support') diff --git a/activesupport/lib/active_support/core_ext/string/multibyte.rb b/activesupport/lib/active_support/core_ext/string/multibyte.rb index 42e053d0f8..3dfe996d06 100644 --- a/activesupport/lib/active_support/core_ext/string/multibyte.rb +++ b/activesupport/lib/active_support/core_ext/string/multibyte.rb @@ -2,7 +2,7 @@ require 'active_support/multibyte' class String - unless '1.9'.respond_to?(:force_encoding) + if '1.9'.respond_to?(:force_encoding) # == Multibyte proxy # # +mb_chars+ is a multibyte safe proxy for string methods. @@ -37,23 +37,13 @@ class String # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For # information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte. def mb_chars - if ActiveSupport::Multibyte.proxy_class.wants?(self) + if ActiveSupport::Multibyte.proxy_class.consumes?(self) ActiveSupport::Multibyte.proxy_class.new(self) else self end end - - # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have - # them), returns false otherwise. - def is_utf8? - ActiveSupport::Multibyte::Chars.consumes?(self) - end - else - def mb_chars #:nodoc - self - end - + def is_utf8? #:nodoc case encoding when Encoding::UTF_8 @@ -64,5 +54,19 @@ class String false end end + else + def mb_chars + if ActiveSupport::Multibyte.proxy_class.wants?(self) + ActiveSupport::Multibyte.proxy_class.new(self) + else + self + end + end + + # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have + # them), returns false otherwise. + def is_utf8? + ActiveSupport::Multibyte::Chars.consumes?(self) + end end end diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index 2344bb1bb3..bccc5425a6 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -58,8 +58,9 @@ module ActiveSupport # transliterate("Jürgen") # # => "Juergen" def transliterate(string, replacement = "?") - I18n.transliterate(Multibyte::Chars.normalize( - Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement) + I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize( + ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c), + :replacement => replacement) end # Replaces special characters in a string so that it may be used as part of a 'pretty' URL. diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb index 428c48a484..e7a271a660 100644 --- a/activesupport/lib/active_support/multibyte.rb +++ b/activesupport/lib/active_support/multibyte.rb @@ -1,30 +1,12 @@ # encoding: utf-8 - require 'active_support/core_ext/module/attribute_accessors' module ActiveSupport #:nodoc: module Multibyte autoload :EncodingError, 'active_support/multibyte/exceptions' autoload :Chars, 'active_support/multibyte/chars' - autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database' - autoload :Codepoint, 'active_support/multibyte/unicode_database' - autoload :UCD, 'active_support/multibyte/unicode_database' + autoload :Unicode, 'active_support/multibyte/unicode' - # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more - # information about normalization. - NORMALIZATION_FORMS = [:c, :kc, :d, :kd] - - # The Unicode version that is supported by the implementation - UNICODE_VERSION = '5.1.0' - - # The default normalization used for operations that require normalization. It can be set to any of the - # normalizations in NORMALIZATION_FORMS. - # - # Example: - # ActiveSupport::Multibyte.default_normalization_form = :c - mattr_accessor :default_normalization_form - self.default_normalization_form = :kc - # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for # an example how to do this. diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb index cca30d1141..429b65bf15 100644 --- a/activesupport/lib/active_support/multibyte/chars.rb +++ b/activesupport/lib/active_support/multibyte/chars.rb @@ -34,52 +34,12 @@ module ActiveSupport #:nodoc: # # ActiveSupport::Multibyte.proxy_class = CharsForUTF32 class Chars - # Hangul character boundaries and properties - HANGUL_SBASE = 0xAC00 - HANGUL_LBASE = 0x1100 - HANGUL_VBASE = 0x1161 - HANGUL_TBASE = 0x11A7 - HANGUL_LCOUNT = 19 - HANGUL_VCOUNT = 21 - HANGUL_TCOUNT = 28 - HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT - HANGUL_SCOUNT = 11172 - HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT - HANGUL_JAMO_FIRST = 0x1100 - HANGUL_JAMO_LAST = 0x11FF - - # All the unicode whitespace - UNICODE_WHITESPACE = [ - (0x0009..0x000D).to_a, # White_Space # Cc [5] .. - 0x0020, # White_Space # Zs SPACE - 0x0085, # White_Space # Cc - 0x00A0, # White_Space # Zs NO-BREAK SPACE - 0x1680, # White_Space # Zs OGHAM SPACE MARK - 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR - (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE - 0x2028, # White_Space # Zl LINE SEPARATOR - 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR - 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE - 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE - 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE - ].flatten.freeze - - # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish - # between little and big endian. This is not an issue in utf-8, so it must be ignored. - UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM - - # Returns a regular expression pattern that matches the passed Unicode codepoints - def self.codepoints_to_pattern(array_of_codepoints) #:nodoc: - array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') - end - UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/u - UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/u attr_reader :wrapped_string alias to_s wrapped_string alias to_str wrapped_string - if '1.9'.respond_to?(:force_encoding) + if RUBY_VERSION >= "1.9" # Creates a new Chars instance by wrapping _string_. def initialize(string) @wrapped_string = string @@ -113,12 +73,6 @@ module ActiveSupport #:nodoc: true end - # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns - # +false+ otherwise. - def self.wants?(string) - $KCODE == 'UTF8' && consumes?(string) - end - # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise. def self.consumes?(string) # Unpack is a little bit faster than regular expressions. @@ -130,89 +84,131 @@ module ActiveSupport #:nodoc: include Comparable - # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, - # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+. - # See String#<=> for more details. - # - # Example: - # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1 - def <=>(other) - @wrapped_string <=> other.to_s - end + if RUBY_VERSION < "1.9" + # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns + # +false+ otherwise. + def self.wants?(string) + $KCODE == 'UTF8' && consumes?(string) + end - # Returns a new Chars object containing the _other_ object concatenated to the string. - # - # Example: - # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl" - def +(other) - self << other - end + # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, + # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+. + # See String#<=> for more details. + # + # Example: + # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1 + def <=>(other) + @wrapped_string <=> other.to_s + end - # Like String#=~ only it returns the character offset (in codepoints) instead of the byte offset. - # - # Example: - # 'Café périferôl'.mb_chars =~ /ô/ #=> 12 - def =~(other) - translate_offset(@wrapped_string =~ other) - end + # Returns a new Chars object containing the _other_ object concatenated to the string. + # + # Example: + # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl" + def +(other) + self << other + end - # Works just like String#split, with the exception that the items in the resulting list are Chars - # instances instead of String. This makes chaining methods easier. - # - # Example: - # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"] - def split(*args) - @wrapped_string.split(*args).map { |i| i.mb_chars } - end + # Like String#=~ only it returns the character offset (in codepoints) instead of the byte offset. + # + # Example: + # 'Café périferôl'.mb_chars =~ /ô/ #=> 12 + def =~(other) + translate_offset(@wrapped_string =~ other) + end - # Inserts the passed string at specified codepoint offsets. - # - # Example: - # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl" - def insert(offset, fragment) - unpacked = self.class.u_unpack(@wrapped_string) - unless offset > unpacked.length - @wrapped_string.replace( - self.class.u_unpack(@wrapped_string).insert(offset, *self.class.u_unpack(fragment)).pack('U*') - ) - else - raise IndexError, "index #{offset} out of string" + # Inserts the passed string at specified codepoint offsets. + # + # Example: + # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl" + def insert(offset, fragment) + unpacked = Unicode.u_unpack(@wrapped_string) + unless offset > unpacked.length + @wrapped_string.replace( + Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*') + ) + else + raise IndexError, "index #{offset} out of string" + end + self end - self - end - # Returns +true+ if contained string contains _other_. Returns +false+ otherwise. - # - # Example: - # 'Café'.mb_chars.include?('é') #=> true - def include?(other) - # We have to redefine this method because Enumerable defines it. - @wrapped_string.include?(other) - end + # Returns +true+ if contained string contains _other_. Returns +false+ otherwise. + # + # Example: + # 'Café'.mb_chars.include?('é') #=> true + def include?(other) + # We have to redefine this method because Enumerable defines it. + @wrapped_string.include?(other) + end - # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found. - # - # Example: - # 'Café périferôl'.mb_chars.index('ô') #=> 12 - # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0 - def index(needle, offset=0) - wrapped_offset = first(offset).wrapped_string.length - index = @wrapped_string.index(needle, wrapped_offset) - index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil + # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found. + # + # Example: + # 'Café périferôl'.mb_chars.index('ô') #=> 12 + # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0 + def index(needle, offset=0) + wrapped_offset = first(offset).wrapped_string.length + index = @wrapped_string.index(needle, wrapped_offset) + index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil + end + + # Returns the position _needle_ in the string, counting in + # codepoints, searching backward from _offset_ or the end of the + # string. Returns +nil+ if _needle_ isn't found. + # + # Example: + # 'Café périferôl'.mb_chars.rindex('é') #=> 6 + # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13 + def rindex(needle, offset=nil) + offset ||= length + wrapped_offset = first(offset).wrapped_string.length + index = @wrapped_string.rindex(needle, wrapped_offset) + index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil + end + + # Returns the number of codepoints in the string + def size + Unicode.u_unpack(@wrapped_string).size + end + alias_method :length, :size + + # Strips entire range of Unicode whitespace from the right of the string. + def rstrip + chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, '')) + end + + # Strips entire range of Unicode whitespace from the left of the string. + def lstrip + chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, '')) + end + + # Strips entire range of Unicode whitespace from the right and left of the string. + def strip + rstrip.lstrip + end + + # Returns the codepoint of the first character in the string. + # + # Example: + # 'こんにちは'.mb_chars.ord #=> 12371 + def ord + Unicode.u_unpack(@wrapped_string)[0] + end + + else + def =~(other) + @wrapped_string =~ other + end end - # Returns the position _needle_ in the string, counting in - # codepoints, searching backward from _offset_ or the end of the - # string. Returns +nil+ if _needle_ isn't found. + # Works just like String#split, with the exception that the items in the resulting list are Chars + # instances instead of String. This makes chaining methods easier. # # Example: - # 'Café périferôl'.mb_chars.rindex('é') #=> 6 - # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13 - def rindex(needle, offset=nil) - offset ||= length - wrapped_offset = first(offset).wrapped_string.length - index = @wrapped_string.rindex(needle, wrapped_offset) - index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil + # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"] + def split(*args) + @wrapped_string.split(*args).map { |i| i.mb_chars } end # Like String#[]=, except instead of byte offsets you specify character offsets. @@ -234,7 +230,7 @@ module ActiveSupport #:nodoc: if args.first.is_a?(Regexp) @wrapped_string[*args] = replace_by else - result = self.class.u_unpack(@wrapped_string) + result = Unicode.u_unpack(@wrapped_string) if args[0].is_a?(Fixnum) raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length min = args[0] @@ -247,10 +243,10 @@ module ActiveSupport #:nodoc: else needle = args[0].to_s min = index(needle) - max = min + self.class.u_unpack(needle).length - 1 + max = min + Unicode.u_unpack(needle).length - 1 range = Range.new(min, max) end - result[range] = self.class.u_unpack(replace_by) + result[range] = Unicode.u_unpack(replace_by) @wrapped_string.replace(result.pack('U*')) end end @@ -294,33 +290,13 @@ module ActiveSupport #:nodoc: justify(integer, :center, padstr) end - # Strips entire range of Unicode whitespace from the right of the string. - def rstrip - chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) - end - - # Strips entire range of Unicode whitespace from the left of the string. - def lstrip - chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) - end - - # Strips entire range of Unicode whitespace from the right and left of the string. - def strip - rstrip.lstrip - end - - # Returns the number of codepoints in the string - def size - self.class.u_unpack(@wrapped_string).size - end - alias_method :length, :size # Reverses all characters in the string. # # Example: # 'Café'.mb_chars.reverse.to_s #=> 'éfaC' def reverse - chars(self.class.g_unpack(@wrapped_string).reverse.flatten.pack('U*')) + chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*')) end # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that @@ -336,15 +312,15 @@ module ActiveSupport #:nodoc: elsif (args.size == 2 && !args[1].is_a?(Numeric)) raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native elsif args[0].kind_of? Range - cps = self.class.u_unpack(@wrapped_string).slice(*args) + cps = Unicode.u_unpack(@wrapped_string).slice(*args) result = cps.nil? ? nil : cps.pack('U*') elsif args[0].kind_of? Regexp result = @wrapped_string.slice(*args) elsif args.size == 1 && args[0].kind_of?(Numeric) - character = self.class.u_unpack(@wrapped_string)[args[0]] + character = Unicode.u_unpack(@wrapped_string)[args[0]] result = character.nil? ? nil : [character].pack('U') else - result = self.class.u_unpack(@wrapped_string).slice(*args).pack('U*') + result = Unicode.u_unpack(@wrapped_string).slice(*args).pack('U*') end result.nil? ? nil : chars(result) end @@ -372,20 +348,12 @@ module ActiveSupport #:nodoc: slice(0...translate_offset(limit)) end - # Returns the codepoint of the first character in the string. - # - # Example: - # 'こんにちは'.mb_chars.ord #=> 12371 - def ord - self.class.u_unpack(@wrapped_string)[0] - end - # Convert characters in the string to uppercase. # # Example: # 'Laurent, où sont les tests ?'.mb_chars.upcase.to_s #=> "LAURENT, OÙ SONT LES TESTS ?" def upcase - apply_mapping :uppercase_mapping + chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping) end # Convert characters in the string to lowercase. @@ -393,7 +361,7 @@ module ActiveSupport #:nodoc: # Example: # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum" def downcase - apply_mapping :lowercase_mapping + chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping) end # Converts the first character to uppercase and the remainder to lowercase. @@ -409,9 +377,9 @@ module ActiveSupport #:nodoc: # # * form - The form you want to normalize in. Should be one of the following: # :c, :kc, :d, or :kd. Default is - # ActiveSupport::Multibyte.default_normalization_form - def normalize(form=ActiveSupport::Multibyte.default_normalization_form) - chars(self.class.normalize(@wrapped_string, form)) + # ActiveSupport::Multibyte::Unicode.default_normalization_form + def normalize(form = nil) + chars(Unicode.normalize(@wrapped_string, form)) end # Performs canonical decomposition on all the characters. @@ -420,7 +388,7 @@ module ActiveSupport #:nodoc: # 'é'.length #=> 2 # 'é'.mb_chars.decompose.to_s.length #=> 3 def decompose - chars(self.class.decompose_codepoints(:canonical, self.class.u_unpack(@wrapped_string)).pack('U*')) + chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*')) end # Performs composition on all the characters. @@ -429,7 +397,7 @@ module ActiveSupport #:nodoc: # 'é'.length #=> 3 # 'é'.mb_chars.compose.to_s.length #=> 2 def compose - chars(self.class.compose_codepoints(self.class.u_unpack(@wrapped_string)).pack('U*')) + chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*')) end # Returns the number of grapheme clusters in the string. @@ -438,14 +406,14 @@ module ActiveSupport #:nodoc: # 'क्षि'.mb_chars.length #=> 4 # 'क्षि'.mb_chars.g_length #=> 3 def g_length - self.class.g_unpack(@wrapped_string).length + Unicode.g_unpack(@wrapped_string).length end # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. # # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. def tidy_bytes(force = false) - chars(self.class.tidy_bytes(@wrapped_string, force)) + chars(Unicode.tidy_bytes(@wrapped_string, force)) end %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method| @@ -459,266 +427,6 @@ module ActiveSupport #:nodoc: end end - class << self - - # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't - # valid UTF-8. - # - # Example: - # Chars.u_unpack('Café') #=> [67, 97, 102, 233] - def u_unpack(string) - begin - string.unpack 'U*' - rescue ArgumentError - raise EncodingError, 'malformed UTF-8 character' - end - end - - # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified - # character class and +false+ otherwise. Valid character classes are: :cr, :lf, :l, - # :v, :lv, :lvt and :t. - # - # Primarily used by the grapheme cluster support. - def in_char_class?(codepoint, classes) - classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false - end - - # Unpack the string at grapheme boundaries. Returns a list of character lists. - # - # Example: - # Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]] - # Chars.g_unpack('Café') #=> [[67], [97], [102], [233]] - def g_unpack(string) - codepoints = u_unpack(string) - unpacked = [] - pos = 0 - marker = 0 - eoc = codepoints.length - while(pos < eoc) - pos += 1 - previous = codepoints[pos-1] - current = codepoints[pos] - if ( - # CR X LF - one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or - # L X (L|V|LV|LVT) - two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or - # (LV|V) X (V|T) - three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or - # (LVT|T) X (T) - four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or - # X Extend - five = (UCD.boundary[:extend] === current) - ) - else - unpacked << codepoints[marker..pos-1] - marker = pos - end - end - unpacked - end - - # Reverse operation of g_unpack. - # - # Example: - # Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि' - def g_pack(unpacked) - (unpacked.flatten).pack('U*') - end - - def padding(padsize, padstr=' ') #:nodoc: - if padsize != 0 - new(padstr * ((padsize / u_unpack(padstr).size) + 1)).slice(0, padsize) - else - '' - end - end - - # Re-order codepoints so the string becomes canonical. - def reorder_characters(codepoints) - length = codepoints.length- 1 - pos = 0 - while pos < length do - cp1, cp2 = UCD.codepoints[codepoints[pos]], UCD.codepoints[codepoints[pos+1]] - if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) - codepoints[pos..pos+1] = cp2.code, cp1.code - pos += (pos > 0 ? -1 : 1) - else - pos += 1 - end - end - codepoints - end - - # Decompose composed characters to the decomposed form. - def decompose_codepoints(type, codepoints) - codepoints.inject([]) do |decomposed, cp| - # if it's a hangul syllable starter character - if HANGUL_SBASE <= cp and cp < HANGUL_SLAST - sindex = cp - HANGUL_SBASE - ncp = [] # new codepoints - ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT - ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT - tindex = sindex % HANGUL_TCOUNT - ncp << (HANGUL_TBASE + tindex) unless tindex == 0 - decomposed.concat ncp - # if the codepoint is decomposable in with the current decomposition type - elsif (ncp = UCD.codepoints[cp].decomp_mapping) and (!UCD.codepoints[cp].decomp_type || type == :compatability) - decomposed.concat decompose_codepoints(type, ncp.dup) - else - decomposed << cp - end - end - end - - # Compose decomposed characters to the composed form. - def compose_codepoints(codepoints) - pos = 0 - eoa = codepoints.length - 1 - starter_pos = 0 - starter_char = codepoints[0] - previous_combining_class = -1 - while pos < eoa - pos += 1 - lindex = starter_char - HANGUL_LBASE - # -- Hangul - if 0 <= lindex and lindex < HANGUL_LCOUNT - vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 - if 0 <= vindex and vindex < HANGUL_VCOUNT - tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 - if 0 <= tindex and tindex < HANGUL_TCOUNT - j = starter_pos + 2 - eoa -= 2 - else - tindex = 0 - j = starter_pos + 1 - eoa -= 1 - end - codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE - end - starter_pos += 1 - starter_char = codepoints[starter_pos] - # -- Other characters - else - current_char = codepoints[pos] - current = UCD.codepoints[current_char] - if current.combining_class > previous_combining_class - if ref = UCD.composition_map[starter_char] - composition = ref[current_char] - else - composition = nil - end - unless composition.nil? - codepoints[starter_pos] = composition - starter_char = composition - codepoints.delete_at pos - eoa -= 1 - pos -= 1 - previous_combining_class = -1 - else - previous_combining_class = current.combining_class - end - else - previous_combining_class = current.combining_class - end - if current.combining_class == 0 - starter_pos = pos - starter_char = codepoints[pos] - end - end - end - codepoints - end - - def tidy_byte(byte) - if byte < 160 - [UCD.cp1252[byte] || byte].pack("U").unpack("C*") - elsif byte < 192 - [194, byte] - else - [195, byte - 64] - end - end - private :tidy_byte - - # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. - # - # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. - def tidy_bytes(string, force = false) - if force - return string.unpack("C*").map do |b| - tidy_byte(b) - end.flatten.compact.pack("C*").unpack("U*").pack("U*") - end - - bytes = string.unpack("C*") - conts_expected = 0 - last_lead = 0 - - bytes.each_index do |i| - - byte = bytes[i] - is_ascii = byte < 128 - is_cont = byte > 127 && byte < 192 - is_lead = byte > 191 && byte < 245 - is_unused = byte > 240 - is_restricted = byte > 244 - - # Impossible or highly unlikely byte? Clean it. - if is_unused || is_restricted - bytes[i] = tidy_byte(byte) - elsif is_cont - # Not expecting contination byte? Clean up. Otherwise, now expect one less. - conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 - else - if conts_expected > 0 - # Expected continuation, but got ASCII or leading? Clean backwards up to - # the leading byte. - (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - conts_expected = 0 - end - if is_lead - # Final byte is leading? Clean it. - if i == bytes.length - 1 - bytes[i] = tidy_byte(bytes.last) - else - # Valid leading byte? Expect continuations determined by position of - # first zero bit, with max of 3. - conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 - last_lead = i - end - end - end - end - bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") - end - - # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for - # passing strings to databases and validations. - # - # * string - The string to perform normalization on. - # * form - The form you want to normalize in. Should be one of the following: - # :c, :kc, :d, or :kd. Default is - # ActiveSupport::Multibyte.default_normalization_form - def normalize(string, form=ActiveSupport::Multibyte.default_normalization_form) - # See http://www.unicode.org/reports/tr15, Table 1 - codepoints = u_unpack(string) - case form - when :d - reorder_characters(decompose_codepoints(:canonical, codepoints)) - when :c - compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints))) - when :kd - reorder_characters(decompose_codepoints(:compatability, codepoints)) - when :kc - compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints))) - else - raise ArgumentError, "#{form} is not a valid normalization variant", caller - end.pack('U*') - end - - end - protected def translate_offset(byte_offset) #:nodoc: @@ -743,26 +451,23 @@ module ActiveSupport #:nodoc: padsize = padsize > 0 ? padsize : 0 case way when :right - result = @wrapped_string.dup.insert(0, self.class.padding(padsize, padstr)) + result = @wrapped_string.dup.insert(0, padding(padsize, padstr)) when :left - result = @wrapped_string.dup.insert(-1, self.class.padding(padsize, padstr)) + result = @wrapped_string.dup.insert(-1, padding(padsize, padstr)) when :center - lpad = self.class.padding((padsize / 2.0).floor, padstr) - rpad = self.class.padding((padsize / 2.0).ceil, padstr) + lpad = padding((padsize / 2.0).floor, padstr) + rpad = padding((padsize / 2.0).ceil, padstr) result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad) end chars(result) end - def apply_mapping(mapping) #:nodoc: - chars(self.class.u_unpack(@wrapped_string).map do |codepoint| - cp = UCD.codepoints[codepoint] - if cp and (ncp = cp.send(mapping)) and ncp > 0 - ncp - else - codepoint - end - end.pack('U*')) + def padding(padsize, padstr=' ') #:nodoc: + if padsize != 0 + chars(padstr * ((padsize / Unicode.u_unpack(padstr).size) + 1)).slice(0, padsize) + else + '' + end end def chars(string) #:nodoc: diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb new file mode 100644 index 0000000000..f91e50c755 --- /dev/null +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -0,0 +1,393 @@ +module ActiveSupport + module Multibyte + module Unicode + + extend self + + # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more + # information about normalization. + NORMALIZATION_FORMS = [:c, :kc, :d, :kd] + + # The Unicode version that is supported by the implementation + UNICODE_VERSION = '5.1.0' + + # The default normalization used for operations that require normalization. It can be set to any of the + # normalizations in NORMALIZATION_FORMS. + # + # Example: + # ActiveSupport::Multibyte::Unicode.default_normalization_form = :c + attr_accessor :default_normalization_form + @default_normalization_form = :kc + + # Hangul character boundaries and properties + HANGUL_SBASE = 0xAC00 + HANGUL_LBASE = 0x1100 + HANGUL_VBASE = 0x1161 + HANGUL_TBASE = 0x11A7 + HANGUL_LCOUNT = 19 + HANGUL_VCOUNT = 21 + HANGUL_TCOUNT = 28 + HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT + HANGUL_SCOUNT = 11172 + HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT + HANGUL_JAMO_FIRST = 0x1100 + HANGUL_JAMO_LAST = 0x11FF + + # All the unicode whitespace + WHITESPACE = [ + (0x0009..0x000D).to_a, # White_Space # Cc [5] .. + 0x0020, # White_Space # Zs SPACE + 0x0085, # White_Space # Cc + 0x00A0, # White_Space # Zs NO-BREAK SPACE + 0x1680, # White_Space # Zs OGHAM SPACE MARK + 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR + (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE + 0x2028, # White_Space # Zl LINE SEPARATOR + 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR + 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE + 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE + 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE + ].flatten.freeze + + # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish + # between little and big endian. This is not an issue in utf-8, so it must be ignored. + LEADERS_AND_TRAILERS = WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM + + # Returns a regular expression pattern that matches the passed Unicode codepoints + def self.codepoints_to_pattern(array_of_codepoints) #:nodoc: + array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') + end + TRAILERS_PAT = /(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+\Z/u + LEADERS_PAT = /\A(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+/u + + # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't + # valid UTF-8. + # + # Example: + # Unicode.u_unpack('Café') #=> [67, 97, 102, 233] + def u_unpack(string) + begin + string.unpack 'U*' + rescue ArgumentError + raise EncodingError, 'malformed UTF-8 character' + end + end + + # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified + # character class and +false+ otherwise. Valid character classes are: :cr, :lf, :l, + # :v, :lv, :lvt and :t. + # + # Primarily used by the grapheme cluster support. + def in_char_class?(codepoint, classes) + classes.detect { |c| database.boundary[c] === codepoint } ? true : false + end + + # Unpack the string at grapheme boundaries. Returns a list of character lists. + # + # Example: + # Unicode.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]] + # Unicode.g_unpack('Café') #=> [[67], [97], [102], [233]] + def g_unpack(string) + codepoints = u_unpack(string) + unpacked = [] + pos = 0 + marker = 0 + eoc = codepoints.length + while(pos < eoc) + pos += 1 + previous = codepoints[pos-1] + current = codepoints[pos] + if ( + # CR X LF + one = ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or + # L X (L|V|LV|LVT) + two = ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or + # (LV|V) X (V|T) + three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or + # (LVT|T) X (T) + four = ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or + # X Extend + five = (database.boundary[:extend] === current) + ) + else + unpacked << codepoints[marker..pos-1] + marker = pos + end + end + unpacked + end + + # Reverse operation of g_unpack. + # + # Example: + # Unicode.g_pack(Unicode.g_unpack('क्षि')) #=> 'क्षि' + def g_pack(unpacked) + (unpacked.flatten).pack('U*') + end + + # Re-order codepoints so the string becomes canonical. + def reorder_characters(codepoints) + length = codepoints.length- 1 + pos = 0 + while pos < length do + cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]] + if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) + codepoints[pos..pos+1] = cp2.code, cp1.code + pos += (pos > 0 ? -1 : 1) + else + pos += 1 + end + end + codepoints + end + + # Decompose composed characters to the decomposed form. + def decompose_codepoints(type, codepoints) + codepoints.inject([]) do |decomposed, cp| + # if it's a hangul syllable starter character + if HANGUL_SBASE <= cp and cp < HANGUL_SLAST + sindex = cp - HANGUL_SBASE + ncp = [] # new codepoints + ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT + ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT + tindex = sindex % HANGUL_TCOUNT + ncp << (HANGUL_TBASE + tindex) unless tindex == 0 + decomposed.concat ncp + # if the codepoint is decomposable in with the current decomposition type + elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability) + decomposed.concat decompose_codepoints(type, ncp.dup) + else + decomposed << cp + end + end + end + + # Compose decomposed characters to the composed form. + def compose_codepoints(codepoints) + pos = 0 + eoa = codepoints.length - 1 + starter_pos = 0 + starter_char = codepoints[0] + previous_combining_class = -1 + while pos < eoa + pos += 1 + lindex = starter_char - HANGUL_LBASE + # -- Hangul + if 0 <= lindex and lindex < HANGUL_LCOUNT + vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 + if 0 <= vindex and vindex < HANGUL_VCOUNT + tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 + if 0 <= tindex and tindex < HANGUL_TCOUNT + j = starter_pos + 2 + eoa -= 2 + else + tindex = 0 + j = starter_pos + 1 + eoa -= 1 + end + codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE + end + starter_pos += 1 + starter_char = codepoints[starter_pos] + # -- Other characters + else + current_char = codepoints[pos] + current = database.codepoints[current_char] + if current.combining_class > previous_combining_class + if ref = database.composition_map[starter_char] + composition = ref[current_char] + else + composition = nil + end + unless composition.nil? + codepoints[starter_pos] = composition + starter_char = composition + codepoints.delete_at pos + eoa -= 1 + pos -= 1 + previous_combining_class = -1 + else + previous_combining_class = current.combining_class + end + else + previous_combining_class = current.combining_class + end + if current.combining_class == 0 + starter_pos = pos + starter_char = codepoints[pos] + end + end + end + codepoints + end + + # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. + # + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. + def tidy_bytes(string, force = false) + if force + return string.unpack("C*").map do |b| + tidy_byte(b) + end.flatten.compact.pack("C*").unpack("U*").pack("U*") + end + + bytes = string.unpack("C*") + conts_expected = 0 + last_lead = 0 + + bytes.each_index do |i| + + byte = bytes[i] + is_ascii = byte < 128 + is_cont = byte > 127 && byte < 192 + is_lead = byte > 191 && byte < 245 + is_unused = byte > 240 + is_restricted = byte > 244 + + # Impossible or highly unlikely byte? Clean it. + if is_unused || is_restricted + bytes[i] = tidy_byte(byte) + elsif is_cont + # Not expecting contination byte? Clean up. Otherwise, now expect one less. + conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 + else + if conts_expected > 0 + # Expected continuation, but got ASCII or leading? Clean backwards up to + # the leading byte. + (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} + conts_expected = 0 + end + if is_lead + # Final byte is leading? Clean it. + if i == bytes.length - 1 + bytes[i] = tidy_byte(bytes.last) + else + # Valid leading byte? Expect continuations determined by position of + # first zero bit, with max of 3. + conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 + last_lead = i + end + end + end + end + bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") + end + + # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for + # passing strings to databases and validations. + # + # * string - The string to perform normalization on. + # * form - The form you want to normalize in. Should be one of the following: + # :c, :kc, :d, or :kd. Default is + # ActiveSupport::Multibyte.default_normalization_form + def normalize(string, form=nil) + form ||= @default_normalization_form + # See http://www.unicode.org/reports/tr15, Table 1 + codepoints = u_unpack(string) + case form + when :d + reorder_characters(decompose_codepoints(:canonical, codepoints)) + when :c + compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints))) + when :kd + reorder_characters(decompose_codepoints(:compatability, codepoints)) + when :kc + compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints))) + else + raise ArgumentError, "#{form} is not a valid normalization variant", caller + end.pack('U*') + end + + def apply_mapping(string, mapping) #:nodoc: + u_unpack(string).map do |codepoint| + cp = database.codepoints[codepoint] + if cp and (ncp = cp.send(mapping)) and ncp > 0 + ncp + else + codepoint + end + end.pack('U*') + end + + # Holds data about a codepoint in the Unicode database + class Codepoint + attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping + end + + # Holds static data from the Unicode database + class UnicodeDatabase + ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 + + attr_writer(*ATTRIBUTES) + + def initialize + @codepoints = Hash.new(Codepoint.new) + @composition_exclusion = [] + @composition_map = {} + @boundary = {} + @cp1252 = {} + end + + # Lazy load the Unicode database so it's only loaded when it's actually used + ATTRIBUTES.each do |attr_name| + class_eval(<<-EOS, __FILE__, __LINE__ + 1) + def #{attr_name} # def codepoints + load # load + @#{attr_name} # @codepoints + end # end + EOS + end + + # Loads the Unicode database and returns all the internal objects of UnicodeDatabase. + def load + begin + @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read } + rescue Exception => e + raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable") + end + + # Redefine the === method so we can write shorter rules for grapheme cluster breaks + @boundary.each do |k,_| + @boundary[k].instance_eval do + def ===(other) + detect { |i| i === other } ? true : false + end + end if @boundary[k].kind_of?(Array) + end + + # define attr_reader methods for the instance variables + class << self + attr_reader(*ATTRIBUTES) + end + end + + # Returns the directory in which the data files are stored + def self.dirname + File.dirname(__FILE__) + '/../values/' + end + + # Returns the filename for the data file for this version + def self.filename + File.expand_path File.join(dirname, "unicode_tables.dat") + end + end + + private + + def tidy_byte(byte) + if byte < 160 + [database.cp1252[byte] || byte].pack("U").unpack("C*") + elsif byte < 192 + [194, byte] + else + [195, byte - 64] + end + end + + def database + @database ||= UnicodeDatabase.new + end + + end + end +end diff --git a/activesupport/lib/active_support/multibyte/unicode_database.rb b/activesupport/lib/active_support/multibyte/unicode_database.rb deleted file mode 100644 index 074ad8613a..0000000000 --- a/activesupport/lib/active_support/multibyte/unicode_database.rb +++ /dev/null @@ -1,71 +0,0 @@ -# encoding: utf-8 - -module ActiveSupport #:nodoc: - module Multibyte #:nodoc: - # Holds data about a codepoint in the Unicode database - class Codepoint - attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping - end - - # Holds static data from the Unicode database - class UnicodeDatabase - ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 - - attr_writer(*ATTRIBUTES) - - def initialize - @codepoints = Hash.new(Codepoint.new) - @composition_exclusion = [] - @composition_map = {} - @boundary = {} - @cp1252 = {} - end - - # Lazy load the Unicode database so it's only loaded when it's actually used - ATTRIBUTES.each do |attr_name| - class_eval(<<-EOS, __FILE__, __LINE__ + 1) - def #{attr_name} # def codepoints - load # load - @#{attr_name} # @codepoints - end # end - EOS - end - - # Loads the Unicode database and returns all the internal objects of UnicodeDatabase. - def load - begin - @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read } - rescue Exception => e - raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable") - end - - # Redefine the === method so we can write shorter rules for grapheme cluster breaks - @boundary.each do |k,_| - @boundary[k].instance_eval do - def ===(other) - detect { |i| i === other } ? true : false - end - end if @boundary[k].kind_of?(Array) - end - - # define attr_reader methods for the instance variables - class << self - attr_reader(*ATTRIBUTES) - end - end - - # Returns the directory in which the data files are stored - def self.dirname - File.dirname(__FILE__) + '/../values/' - end - - # Returns the filename for the data file for this version - def self.filename - File.expand_path File.join(dirname, "unicode_tables.dat") - end - end - - # UniCode Database - UCD = UnicodeDatabase.new - end -end \ No newline at end of file diff --git a/activesupport/lib/active_support/values/unicode_tables.dat b/activesupport/lib/active_support/values/unicode_tables.dat index 74b333d416..10f2cae465 100644 Binary files a/activesupport/lib/active_support/values/unicode_tables.dat and b/activesupport/lib/active_support/values/unicode_tables.dat differ -- cgit v1.2.3