2 files changed, 56 insertions, 23 deletions
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 3c0cf9f137..707cf200b5 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -1,4 +1,3 @@
-# encoding: utf-8
 require 'active_support/json'
 require 'active_support/core_ext/string/access'
 require 'active_support/core_ext/string/behavior'
@@ -86,10 +85,20 @@ module ActiveSupport #:nodoc:
         @wrapped_string.split(*args).map { |i| self.class.new(i) }
       end
 
-      # Works like like <tt>String#slice!</tt>, but returns an instance of
-      # Chars, or nil if the string was not modified.
+      # Works like <tt>String#slice!</tt>, but returns an instance of
+      # Chars, or nil if the string was not modified. The string will not be
+      # modified if the range given is out of bounds
+      #
+      #   string = 'Welcome'
+      #   string.mb_chars.slice!(3)    # => #<ActiveSupport::Multibyte::Chars:0x000000038109b8 @wrapped_string="c">
+      #   string # => 'Welome'
+      #   string.mb_chars.slice!(0..3) # => #<ActiveSupport::Multibyte::Chars:0x00000002eb80a0 @wrapped_string="Welo">
+      #   string # => 'me'
       def slice!(*args)
-        chars(@wrapped_string.slice!(*args))
+        string_sliced = @wrapped_string.slice!(*args)
+        if string_sliced
+          chars(string_sliced)
+        end
       end
 
       # Reverses all characters in the string.
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index 35efebc65f..72b20fff06 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -1,4 +1,3 @@
-# encoding: utf-8
 module ActiveSupport
   module Multibyte
     module Unicode
@@ -11,7 +10,7 @@ module ActiveSupport
       NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
 
       # The Unicode version that is supported by the implementation
-      UNICODE_VERSION = '7.0.0'
+      UNICODE_VERSION = '8.0.0'
 
       # The default normalization used for operations that require
       # normalization. It can be set to any of the normalizations
@@ -58,7 +57,7 @@ module ActiveSupport
       # Returns a regular expression pattern that matches the passed Unicode
       # codepoints.
       def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
-        array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
+        array_of_codepoints.collect{ |e| [e].pack 'U*'.freeze }.join('|'.freeze)
       end
       TRAILERS_PAT = /(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+\Z/u
       LEADERS_PAT = /\A(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+/u
@@ -88,19 +87,44 @@ module ActiveSupport
           pos += 1
           previous = codepoints[pos-1]
           current = codepoints[pos]
-          if (
-              # CR X LF
-              ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or
-              # L X (L|V|LV|LVT)
-              ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
-              # (LV|V) X (V|T)
-              ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
-              # (LVT|T) X (T)
-              ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or
-              # X Extend
-              (database.boundary[:extend] === current)
-            )
-          else
+
+          should_break =
+            # GB3. CR X LF
+            if previous == database.boundary[:cr] and current == database.boundary[:lf]
+              false
+            # GB4. (Control|CR|LF) ÷
+            elsif previous and in_char_class?(previous, [:control,:cr,:lf])
+              true
+            # GB5. ÷ (Control|CR|LF)
+            elsif in_char_class?(current, [:control,:cr,:lf])
+              true
+            # GB6. L X (L|V|LV|LVT)
+            elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt])
+              false
+            # GB7. (LV|V) X (V|T)
+            elsif in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t])
+              false
+            # GB8. (LVT|T) X (T)
+            elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current
+              false
+            # GB8a. Regional_Indicator X Regional_Indicator
+            elsif database.boundary[:regional_indicator] === previous and database.boundary[:regional_indicator] === current
+              false
+            # GB9. X Extend
+            elsif database.boundary[:extend] === current
+              false
+            # GB9a. X SpacingMark
+            elsif database.boundary[:spacingmark] === current
+              false
+            # GB9b. Prepend X
+            elsif database.boundary[:prepend] === previous
+              false
+            # GB10. Any ÷ Any
+            else
+              true
+            end
+
+          if should_break
             unpacked << codepoints[marker..pos-1]
             marker = pos
           end
@@ -257,7 +281,7 @@ module ActiveSupport
       # * <tt>string</tt> - The string to perform normalization on.
       # * <tt>form</tt> - The form you want to normalize in. Should be one of
       #   the following: <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>.
-      #   Default is ActiveSupport::Multibyte.default_normalization_form.
+      #   Default is ActiveSupport::Multibyte::Unicode.default_normalization_form.
       def normalize(string, form=nil)
         form ||= @default_normalization_form
         # See http://www.unicode.org/reports/tr15, Table 1
@@ -273,7 +297,7 @@ module ActiveSupport
             compose(reorder_characters(decompose(:compatibility, codepoints)))
           else
             raise ArgumentError, "#{form} is not a valid normalization variant", caller
-        end.pack('U*')
+        end.pack('U*'.freeze)
       end
 
       def downcase(string)
@@ -338,7 +362,7 @@ module ActiveSupport
           end
 
           # Redefine the === method so we can write shorter rules for grapheme cluster breaks
-          @boundary.each do |k,_|
+          @boundary.each_key do |k|
             @boundary[k].instance_eval do
               def ===(other)
                 detect { |i| i === other } ? true : false