Merge pull request #13495 from norman/tidy_bytes

Use String#scrub when available to tidy bytes
author: Jeremy Kemper <jeremy@bitsweat.net> 2013-12-26 12:14:30 -0800
committer: Jeremy Kemper <jeremy@bitsweat.net> 2013-12-26 12:14:30 -0800
commit: 037665cbe1f7705afaeb924e34eeb0d93f60b290 (patch)
tree: a821d4b438106f2bbe5f58d2995f48f09753f39d
parent: cef8c8f772cf7a853a1c447f02baecfdde9fa237 (diff)
parent: ab195841ddc7302ca6e6fc4a5962bc5ab3b8c09b (diff)
download: rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.tar.gz
rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.tar.bz2
rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.zip
1 file changed, 35 insertions, 35 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index 1845c6ae38..5fd410fbe1 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -212,37 +212,43 @@ module ActiveSupport
         codepoints
       end
 
-      # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
-      # resulting in a valid UTF-8 string.
-      #
-      # Passing +true+ will forcibly tidy all bytes, assuming that the string's
-      # encoding is entirely CP1252 or ISO-8859-1.
-      def tidy_bytes(string, force = false)
-        return string if string.empty?
-
-        if force
-          return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+      # Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1.
+      if RUBY_VERSION >= '2.1'
+        # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
+        # resulting in a valid UTF-8 string.
+        #
+        # Passing +true+ will forcibly tidy all bytes, assuming that the string's
+        # encoding is entirely CP1252 or ISO-8859-1.
+        def tidy_bytes(string, force = false)
+          return string if string.empty?
+          return recode_windows1252_chars(string) if force
+          string.scrub { |bad| recode_windows1252_chars(bad) }
         end
+      else
+        def tidy_bytes(string, force = false)
+          return string if string.empty?
+          return recode_windows1252_chars(string) if force
+
+          # We can't transcode to the same format, so we choose a nearly-identical encoding.
+          # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
+          # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
+          # before returning.
+          reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
+
+          source = string.dup
+          out = ''.force_encoding(Encoding::UTF_8_MAC)
+
+          loop do
+            reader.primitive_convert(source, out)
+            _, _, _, error_bytes, _ = reader.primitive_errinfo
+            break if error_bytes.nil?
+            out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+          end
 
-        # We can't transcode to the same format, so we choose a nearly-identical encoding.
-        # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
-        # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
-        # before returning.
-        reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
-
-        source = string.dup
-        out = ''.force_encoding(Encoding::UTF_8_MAC)
+          reader.finish
 
-        loop do
-          reader.primitive_convert(source, out)
-          _, _, _, error_bytes, _ = reader.primitive_errinfo
-          break if error_bytes.nil?
-          out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+          out.encode!(Encoding::UTF_8)
         end
-
-        reader.finish
-
-        out.encode!(Encoding::UTF_8)
       end
 
       # Returns the KC normalization of the string by default. NFKC is
@@ -371,14 +377,8 @@ module ActiveSupport
         end.pack('U*')
       end
 
-      def tidy_byte(byte)
-        if byte < 160
-          [database.cp1252[byte] || byte].pack("U").unpack("C*")
-        elsif byte < 192
-          [194, byte]
-        else
-          [195, byte - 64]
-        end
+      def recode_windows1252_chars(string)
+        string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
       end
 
       def database
author	Jeremy Kemper <jeremy@bitsweat.net>	2013-12-26 12:14:30 -0800
committer	Jeremy Kemper <jeremy@bitsweat.net>	2013-12-26 12:14:30 -0800
commit	037665cbe1f7705afaeb924e34eeb0d93f60b290 (patch)
tree	a821d4b438106f2bbe5f58d2995f48f09753f39d
parent	cef8c8f772cf7a853a1c447f02baecfdde9fa237 (diff)
parent	ab195841ddc7302ca6e6fc4a5962bc5ab3b8c09b (diff)
download	rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.tar.gz rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.tar.bz2 rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.zip