diff options
Diffstat (limited to 'activesupport/lib/active_support/multibyte/unicode.rb')
-rw-r--r-- | activesupport/lib/active_support/multibyte/unicode.rb | 60 |
1 files changed, 36 insertions, 24 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb index 217919ccb8..a64223c0e0 100644 --- a/activesupport/lib/active_support/multibyte/unicode.rb +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module ActiveSupport module Multibyte module Unicode @@ -9,7 +11,7 @@ module ActiveSupport NORMALIZATION_FORMS = [:c, :kc, :d, :kd] # The Unicode version that is supported by the implementation - UNICODE_VERSION = "8.0.0" + UNICODE_VERSION = "9.0.0" # The default normalization used for operations that require # normalization. It can be set to any of the normalizations @@ -52,35 +54,35 @@ module ActiveSupport pos = 0 marker = 0 eoc = codepoints.length - while(pos < eoc) + while (pos < eoc) pos += 1 - previous = codepoints[pos-1] + previous = codepoints[pos - 1] current = codepoints[pos] + # See http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules should_break = + if pos == eoc + true # GB3. CR X LF - if previous == database.boundary[:cr] && current == database.boundary[:lf] + elsif previous == database.boundary[:cr] && current == database.boundary[:lf] false # GB4. (Control|CR|LF) ÷ - elsif previous && in_char_class?(previous, [:control,:cr,:lf]) + elsif previous && in_char_class?(previous, [:control, :cr, :lf]) true # GB5. ÷ (Control|CR|LF) - elsif in_char_class?(current, [:control,:cr,:lf]) + elsif in_char_class?(current, [:control, :cr, :lf]) true # GB6. L X (L|V|LV|LVT) - elsif database.boundary[:l] === previous && in_char_class?(current, [:l,:v,:lv,:lvt]) + elsif database.boundary[:l] === previous && in_char_class?(current, [:l, :v, :lv, :lvt]) false # GB7. (LV|V) X (V|T) - elsif in_char_class?(previous, [:lv,:v]) && in_char_class?(current, [:v,:t]) + elsif in_char_class?(previous, [:lv, :v]) && in_char_class?(current, [:v, :t]) false # GB8. (LVT|T) X (T) - elsif in_char_class?(previous, [:lvt,:t]) && database.boundary[:t] === current - false - # GB8a. Regional_Indicator X Regional_Indicator - elsif database.boundary[:regional_indicator] === previous && database.boundary[:regional_indicator] === current + elsif in_char_class?(previous, [:lvt, :t]) && database.boundary[:t] === current false - # GB9. X Extend - elsif database.boundary[:extend] === current + # GB9. X (Extend | ZWJ) + elsif in_char_class?(current, [:extend, :zwj]) false # GB9a. X SpacingMark elsif database.boundary[:spacingmark] === current @@ -88,13 +90,23 @@ module ActiveSupport # GB9b. Prepend X elsif database.boundary[:prepend] === previous false - # GB10. Any ÷ Any + # GB10. (E_Base | EBG) Extend* X E_Modifier + elsif (marker...pos).any? { |i| in_char_class?(codepoints[i], [:e_base, :e_base_gaz]) && codepoints[i + 1...pos].all? { |c| database.boundary[:extend] === c } } && database.boundary[:e_modifier] === current + false + # GB11. ZWJ X (Glue_After_Zwj | EBG) + elsif database.boundary[:zwj] === previous && in_char_class?(current, [:glue_after_zwj, :e_base_gaz]) + false + # GB12. ^ (RI RI)* RI X RI + # GB13. [^RI] (RI RI)* RI X RI + elsif codepoints[marker..pos].all? { |c| database.boundary[:regional_indicator] === c } && codepoints[marker..pos].count { |c| database.boundary[:regional_indicator] === c }.even? + false + # GB999. Any ÷ Any else true end if should_break - unpacked << codepoints[marker..pos-1] + unpacked << codepoints[marker..pos - 1] marker = pos end end @@ -110,12 +122,12 @@ module ActiveSupport # Re-order codepoints so the string becomes canonical. def reorder_characters(codepoints) - length = codepoints.length- 1 + length = codepoints.length - 1 pos = 0 while pos < length do - cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]] + cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos + 1]] if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) - codepoints[pos..pos+1] = cp2.code, cp1.code + codepoints[pos..pos + 1] = cp2.code, cp1.code pos += (pos > 0 ? -1 : 1) else pos += 1 @@ -157,9 +169,9 @@ module ActiveSupport lindex = starter_char - HANGUL_LBASE # -- Hangul if 0 <= lindex && lindex < HANGUL_LCOUNT - vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 + vindex = codepoints[starter_pos + 1] - HANGUL_VBASE rescue vindex = -1 if 0 <= vindex && vindex < HANGUL_VCOUNT - tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 + tindex = codepoints[starter_pos + 2] - HANGUL_TBASE rescue tindex = -1 if 0 <= tindex && tindex < HANGUL_TCOUNT j = starter_pos + 2 eoa -= 2 @@ -251,7 +263,7 @@ module ActiveSupport # * <tt>form</tt> - The form you want to normalize in. Should be one of # the following: <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. # Default is ActiveSupport::Multibyte::Unicode.default_normalization_form. - def normalize(string, form=nil) + def normalize(string, form = nil) form ||= @default_normalization_form # See http://www.unicode.org/reports/tr15, Table 1 codepoints = string.codepoints.to_a @@ -347,7 +359,7 @@ module ActiveSupport # Returns the directory in which the data files are stored. def self.dirname - File.dirname(__FILE__) + "/../values/" + File.expand_path("../values", __dir__) end # Returns the filename for the data file for this version. @@ -358,7 +370,7 @@ module ActiveSupport private - def apply_mapping(string, mapping) #:nodoc: + def apply_mapping(string, mapping) database.codepoints string.each_codepoint.map do |codepoint| cp = database.codepoints[codepoint] |