1 files changed, 30 insertions, 43 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index f49ca47f14..1845c6ae38 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -11,7 +11,7 @@ module ActiveSupport
       NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
 
       # The Unicode version that is supported by the implementation
-      UNICODE_VERSION = '6.1.0'
+      UNICODE_VERSION = '6.2.0'
 
       # The default normalization used for operations that require
       # normalization. It can be set to any of the normalizations
@@ -145,7 +145,7 @@ module ActiveSupport
             ncp << (HANGUL_TBASE + tindex) unless tindex == 0
             decomposed.concat ncp
           # if the codepoint is decomposable in with the current decomposition type
-          elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability)
+          elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatibility)
             decomposed.concat decompose(type, ncp.dup)
           else
             decomposed << cp
@@ -218,51 +218,31 @@ module ActiveSupport
       # Passing +true+ will forcibly tidy all bytes, assuming that the string's
       # encoding is entirely CP1252 or ISO-8859-1.
       def tidy_bytes(string, force = false)
+        return string if string.empty?
+
         if force
-          return string.unpack("C*").map do |b|
-            tidy_byte(b)
-          end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+          return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
         end
 
-        bytes = string.unpack("C*")
-        conts_expected = 0
-        last_lead = 0
-
-        bytes.each_index do |i|
+        # Transcoding needs a target encoding that differs from the source, so we
+        # convert from UTF-8 to the nearly identical UTF8-MAC. Valid UTF-8 is transcoded
+        # normally; bytes that raise a conversion error are re-read as CP1252 instead.
+        # The accumulated result is converted back to UTF-8 before returning.
+        reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
 
-          byte          = bytes[i]
-          is_cont       = byte > 127 && byte < 192
-          is_lead       = byte > 191 && byte < 245
-          is_unused     = byte > 240
-          is_restricted = byte > 244
+        source = string.dup
+        out = ''.force_encoding(Encoding::UTF_8_MAC)
 
-          # Impossible or highly unlikely byte? Clean it.
-          if is_unused || is_restricted
-            bytes[i] = tidy_byte(byte)
-          elsif is_cont
-            # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
-            conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
-          else
-            if conts_expected > 0
-              # Expected continuation, but got ASCII or leading? Clean backwards up to
-              # the leading byte.
-              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-              conts_expected = 0
-            end
-            if is_lead
-              # Final byte is leading? Clean it.
-              if i == bytes.length - 1
-                bytes[i] = tidy_byte(bytes.last)
-              else
-                # Valid leading byte? Expect continuations determined by position of
-                # first zero bit, with max of 3.
-                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
-                last_lead = i
-              end
-            end
-          end
+        loop do
+          reader.primitive_convert(source, out)
+          _, _, _, error_bytes, _ = reader.primitive_errinfo
+          break if error_bytes.nil?
+          out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
         end
-        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
+
+        reader.finish
+
+        out.encode!(Encoding::UTF_8)
       end
 
       # Returns the KC normalization of the string by default. NFKC is
@@ -283,9 +263,9 @@ module ActiveSupport
           when :c
             compose(reorder_characters(decompose(:canonical, codepoints)))
           when :kd
-            reorder_characters(decompose(:compatability, codepoints))
+            reorder_characters(decompose(:compatibility, codepoints))
           when :kc
-            compose(reorder_characters(decompose(:compatability, codepoints)))
+            compose(reorder_characters(decompose(:compatibility, codepoints)))
           else
             raise ArgumentError, "#{form} is not a valid normalization variant", caller
         end.pack('U*')
@@ -307,6 +287,13 @@ module ActiveSupport
       class Codepoint
         attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
 
+        # Initializes the combining class and both case mappings to 0.
+        def initialize
+          @combining_class = 0
+          @uppercase_mapping = 0
+          @lowercase_mapping = 0
+        end
+
         def swapcase_mapping
           uppercase_mapping > 0 ? uppercase_mapping : lowercase_mapping
         end