diff options
Diffstat (limited to 'activesupport/lib/active_support/multibyte/chars.rb')
-rw-r--r-- | activesupport/lib/active_support/multibyte/chars.rb | 45 |
1 files changed, 27 insertions, 18 deletions
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb index 4ade1158fd..cca30d1141 100644 --- a/activesupport/lib/active_support/multibyte/chars.rb +++ b/activesupport/lib/active_support/multibyte/chars.rb @@ -75,8 +75,6 @@ module ActiveSupport #:nodoc: UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/u UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/u - UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] - attr_reader :wrapped_string alias to_s wrapped_string alias to_str wrapped_string @@ -409,25 +407,11 @@ module ActiveSupport #:nodoc: # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for # passing strings to databases and validations. # - # * <tt>str</tt> - The string to perform normalization on. # * <tt>form</tt> - The form you want to normalize in. Should be one of the following: # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is # ActiveSupport::Multibyte.default_normalization_form def normalize(form=ActiveSupport::Multibyte.default_normalization_form) - # See http://www.unicode.org/reports/tr15, Table 1 - codepoints = self.class.u_unpack(@wrapped_string) - chars(case form - when :d - self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)) - when :c - self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))) - when :kd - self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)) - when :kc - self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))) - else - raise ArgumentError, "#{form} is not a valid normalization variant", caller - end.pack('U*')) + chars(self.class.normalize(@wrapped_string, form)) end # Performs canonical decomposition on all the characters. @@ -659,7 +643,7 @@ module ActiveSupport #:nodoc: # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. # - # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1. + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. def tidy_bytes(string, force = false) if force return string.unpack("C*").map do |b| @@ -708,6 +692,31 @@ module ActiveSupport #:nodoc: end bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") end + + # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for + # passing strings to databases and validations. + # + # * <tt>string</tt> - The string to perform normalization on. + # * <tt>form</tt> - The form you want to normalize in. Should be one of the following: + # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is + # ActiveSupport::Multibyte.default_normalization_form + def normalize(string, form=ActiveSupport::Multibyte.default_normalization_form) + # See http://www.unicode.org/reports/tr15, Table 1 + codepoints = u_unpack(string) + case form + when :d + reorder_characters(decompose_codepoints(:canonical, codepoints)) + when :c + compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints))) + when :kd + reorder_characters(decompose_codepoints(:compatability, codepoints)) + when :kc + compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints))) + else + raise ArgumentError, "#{form} is not a valid normalization variant", caller + end.pack('U*') + end + end protected |